tuandunghcmut commited on
Commit
7d9e5ac
·
verified ·
1 Parent(s): b02e3a6

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Groma/pyproject.toml +32 -0
  2. InternVL/.gitignore +171 -0
  3. InternVL/.isort.cfg +26 -0
  4. InternVL/INSTALLATION.md +69 -0
  5. InternVL/requirements.txt +4 -0
  6. LLM2CLIP/FAQ.md +133 -0
  7. LLM2CLIP/LICENSE +21 -0
  8. LLM2CLIP/SUPPORT.md +25 -0
  9. LLaVA/.dockerignore +21 -0
  10. LLaVA/.editorconfig +18 -0
  11. LLaVA/LICENSE +201 -0
  12. LLaVA/cog.yaml +37 -0
  13. LLaVA/pyproject.toml +37 -0
  14. OpenSeeD/README.md +77 -0
  15. OpenSeeD/__init__.py +0 -0
  16. OpenSeeD/requirements.txt +30 -0
  17. Ovis/README.md +110 -0
  18. PaddleMIX/.copyright.hook +134 -0
  19. PaddleMIX/.style.yapf +3 -0
  20. PaddleMIX/LICENSE +201 -0
  21. PaddleMIX/README_EN.md +390 -0
  22. PaddleMIX/VERSION +1 -0
  23. PaddleMIX/check_env.sh +101 -0
  24. PaddleMIX/pyproject.toml +23 -0
  25. PaddleMIX/requirements.txt +15 -0
  26. VILA/LongVILA.md +79 -0
  27. VILA/convert_ckpt.py +91 -0
  28. VILA/environment_setup.sh +33 -0
  29. VILA/predict.py +189 -0
  30. VLMEvalKit/.pre-commit-config.yaml +30 -0
  31. VLMEvalKit/requirements.txt +30 -0
  32. a_distributed_notebook/FSDP_tutorial.md +519 -0
  33. a_distributed_notebook/temp/all_gather.py +116 -0
  34. a_distributed_notebook/temp/run_4.py +63 -0
  35. a_main_folder/convert_hf_dataset.ipynb +0 -0
  36. a_temp/deepseek_vl2.ipynb +0 -0
  37. a_temp/docs.html +32 -0
  38. a_temp/example_image.jpg +0 -0
  39. a_temp/openapi.json +1 -0
  40. a_temp/temp1.ipynb +330 -0
  41. a_temp/vllm_example.sh +412 -0
  42. groundingLMM/train.py +671 -0
  43. lightning-hydra-template/.github/codecov.yml +15 -0
  44. lightning-hydra-template/.github/workflows/test.yml +139 -0
  45. lightning-hydra-template/configs/__init__.py +1 -0
  46. lightning-hydra-template/configs/local/.gitkeep +0 -0
  47. lightning-hydra-template/configs/train.yaml +49 -0
  48. lightning-hydra-template/logs/.gitkeep +0 -0
  49. lightning-hydra-template/tests/test_datamodules.py +38 -0
  50. lightning-hydra-template/tests/test_eval.py +39 -0
Groma/pyproject.toml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools<67.0.0,>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "groma"
7
+ version = "1.0.0"
8
+ description = "Grounded Multimodal Large Language Models."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "License :: OSI Approved :: Apache Software License",
14
+ ]
15
+ dependencies = [
16
+ "einops", "fastapi", "gradio==3.23", "markdown2[all]", "numpy",
17
+ "requests", "sentencepiece", "tokenizers==0.12.1",
18
+ "uvicorn", "shortuuid", "scipy", "pycocotools", "pycocoevalcap",
19
+ "deepspeed==0.9.2", "peft==0.3.0", "terminaltables", "transformers==4.32.0",
20
+ "bitsandbytes==0.43.1",
21
+ "lvis @ git+https://github.com/lvis-dataset/lvis-api.git",
22
+ "accelerate @ git+https://github.com/huggingface/accelerate@a2d8f540c3ab37c8f84d616be1300a0572b69cf8"
23
+ ]
24
+
25
+ [project.urls]
26
+ "Homepage" = "https://groma-mllm.github.io/"
27
+
28
+ [tool.setuptools.packages.find]
29
+ exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
30
+
31
+ [tool.wheel]
32
+ exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
InternVL/.gitignore ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ .idea/
163
+
164
+ .DS_Store
165
+ data_process/
166
+ internvl_chat/work_dirs/
167
+ internvl_chat/unittest/
168
+ internvl_chat/data/
169
+ Husky2/*
170
+ data_process/
171
+ *distillation*
InternVL/.isort.cfg ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [isort]
2
+ line-length = 180
3
+ multi_line_output = 0
4
+ extra_standard_library = setuptools
5
+ known_third_party = PIL,asynctest,cityscapesscripts,cv2,gather_models,matplotlib,mmcv,numpy,onnx,onnxruntime,pycocotools,pytest,pytorch_sphinx_theme,requests,scipy,seaborn,six,terminaltables,torch,ts,yaml
6
+ no_lines_before = STDLIB,LOCALFOLDER
7
+ default_section = THIRDPARTY
8
+
9
+ [yapf]
10
+ BASED_ON_STYLE = pep8
11
+ BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
12
+ SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
13
+
14
+ [codespell]
15
+ skip = *.ipynb
16
+ quiet-level = 3
17
+ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,TOOD,tood
18
+ © 2022 GitHub, Inc.
19
+ Terms
20
+ Privacy
21
+ Security
22
+ Status
23
+ Docs
24
+ Contact GitHub
25
+ Pricing
26
+ API
InternVL/INSTALLATION.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## 🛠️ Installation
2
+
3
+ - Clone this repository:
4
+
5
+ ```bash
6
+ git clone https://github.com/OpenGVLab/InternVL.git
7
+ ```
8
+
9
+ - Create a conda virtual environment and activate it:
10
+
11
+ ```bash
12
+ conda create -n internvl python=3.9 -y
13
+ conda activate internvl
14
+ ```
15
+
16
+ - Install dependencies using `requirements.txt`:
17
+
18
+ ```bash
19
+ pip install -r requirements.txt
20
+ ```
21
+
22
+ By default, our `requirements.txt` file includes the following dependencies:
23
+
24
+ - `-r requirements/internvl_chat.txt`
25
+ - `-r requirements/streamlit_demo.txt`
26
+ - `-r requirements/classification.txt`
27
+ - `-r requirements/segmentation.txt`
28
+
29
+ The `clip_benchmark.txt` is **not** included in the default installation. If you require the `clip_benchmark` functionality, please install it manually by running the following command:
30
+
31
+ ```bash
32
+ pip install -r requirements/clip_benchmark.txt
33
+ ```
34
+
35
+ ### Additional Instructions
36
+
37
+ - Install `flash-attn==2.3.6`:
38
+
39
+ ```bash
40
+ pip install flash-attn==2.3.6 --no-build-isolation
41
+ ```
42
+
43
+ Alternatively you can compile from source:
44
+
45
+ ```bash
46
+ git clone https://github.com/Dao-AILab/flash-attention.git
47
+ cd flash-attention
48
+ git checkout v2.3.6
49
+ python setup.py install
50
+ ```
51
+
52
+ - Install `mmcv-full==1.6.2` (optional, for `segmentation`):
53
+
54
+ ```bash
55
+ pip install -U openmim
56
+ mim install mmcv-full==1.6.2
57
+ ```
58
+
59
+ - Install `apex` (optional, for `segmentation`):
60
+
61
+ ```bash
62
+ git clone https://github.com/NVIDIA/apex.git
63
+ git checkout 2386a912164b0c5cfcd8be7a2b890fbac5607c82 # https://github.com/NVIDIA/apex/issues/1735
64
+ pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
65
+ ```
66
+
67
+ If you encounter `ModuleNotFoundError: No module named 'fused_layer_norm_cuda'`, it is because apex's CUDA extensions are not being installed successfully. You can try uninstalling apex and the code will default to the PyTorch version of RMSNorm. Alternatively, if you prefer using apex, try adding a few lines to `setup.py` and then recompiling.
68
+
69
+ <img src=https://github.com/OpenGVLab/InternVL/assets/23737120/c04a989c-8024-49fa-b62c-2da623e63729 width=50%>
InternVL/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ -r requirements/internvl_chat.txt
2
+ -r requirements/streamlit_demo.txt
3
+ -r requirements/classification.txt
4
+ -r requirements/segmentation.txt
LLM2CLIP/FAQ.md ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Selected Representative Q&A
2
+
3
+ ## Q1:
4
+
5
+ > **Q: It is foreseeable that the technology of LLM2CLIP will be of great significance in expanding CLIP's support for more modal data. As far as the article is concerned, LLM2CLIP has surprisingly improved CLIP's adaptability to cross-language and long text tasks. At the same time, it also proposes application possibilities for higher-dimensional data modalities such as audio and video. Of course, this puts forward further requirements for LLM2CLIP's adaptation strategy and fine-tuning methods. Based on your team's current understanding of LLM2CLIP, what additional challenges will arise, for example, the feature space alignment problem of high-dimensional modalities?**
6
+
7
+ ![A1](https://via.placeholder.com/15/blue/000000?text=+) **A:** To be honest, we’re already exploring a video-based version of LLM2CLIP, including scaling up both the dataset size and model parameters by several orders of magnitude. Please stay tuned for our future updates, and if you’re interested, we’d be happy to discuss this further!
8
+
9
+ Here are some additional challenges I see in this area:
10
+
11
+ 1. **Enhancing the Supervisory Signal in Contrastive Learning:** While LLMs have a strong capability to understand text, providing valuable and rich textual information is equally critical. For instance, for video tasks, we could enrich the input with denser captions, prompts, or instructions. These could provide more complex and detailed information for the LLM to interpret, thereby enabling it to better guide the construction of the cross-modal space.
12
+
13
+ 2. **Expanding Contrastive Learning Loss Across Dimensions:** Contrastive learning losses can be applied across various dimensions, such as the temporal dimension in video data. Different prompts provided to the LLM could be designed to guide and control the training process in these additional dimensions, further strengthening the multimodal representations.
14
+
15
+ 3. **Tackling Complex Temporal Logic in Videos:** The challenges in video understanding often involve designing solutions for complex temporal relationships over extended time spans. Here, we could incorporate self-play techniques using the LLM to introduce tasks and increase the complexity of the training objectives. This might involve designing scenarios where the LLM can simulate and reason about sequences, further enhancing its learning.
16
+
17
+ ## Q2:
18
+
19
+ > **Q: What a groundbreaking paper on LLM2CLIP! The innovative integration of large language models with CLIP to enhance cross-modal representation learning is truly inspiring. The performance improvements demonstrated, particularly in long-text and short-text retrieval tasks, are impressive and have significant implications for the field of multimodal AI.**
20
+ >
21
+ > **My admiration for your work encourages me to inquire about the potential applications of LLM2CLIP in more specialized domains, such as medicine or law, where the precision and expertise of textual understanding are paramount. Therefore, I am curious to know if LLM2CLIP has been tested or if there are plans to test it with domain-specific texts that require a high degree of accuracy and proficiency.**
22
+ >
23
+ > Looking forward to your insights on this matter and how LLM2CLIP might be adapted or extended to meet the challenges of these specialized fields!
24
+ >
25
+ ![A2](https://via.placeholder.com/15/green/000000?text=+) **A:** Your idea is fantastic, and in fact, we have had similar thoughts. I believe there is significant potential in working on specialized fields, and here are my reasons:
26
+
27
+ 1. **Limited Data, High Impact:** Our work focuses on fine-tuning pre-trained CLIP models with very limited data for LLM2CLIP, ranging from 3M to 60M. Compared to the 1-2B data commonly used in CLIP pre-training, this is a small amount, yet it has already demonstrated substantial performance improvements. If we focus on specialized fields, we could leverage limited domain-specific data to train the model exceptionally well in a specific knowledge area. This approach could potentially resolve issues like perception or cognition hallucinations in related multimodal domains entirely.
28
+
29
+ 2. **Leveraging LLM Knowledge as Data Augmentation:** Certain specialized fields, such as medical reports, often suffer from a lack of data. Here, the knowledge encoded in LLMs can serve as an excellent data augmenter due to their access to open-world knowledge over time.
30
+
31
+ We look forward to collaborating with you to push the boundaries of multimodal domains!
32
+
33
+ BTW, we plan to release scaled-up LLM2CLIP models (10-100x larger) next quarter. These models will inherit our general-purpose parameters, potentially making them even more powerful. Please stay tuned to our GitHub!
34
+
35
+ ## Q3:
36
+
37
+ > **Q: Thank you so much for such an outstanding work. I have a couple of questions regarding the fine-tuning process described in Section 3.2, particularly around the integration of loss functions and datasets:**
38
+ >
39
+ > **In the paper, two loss functions are mentioned: SimCSE loss and Masked Next Token Prediction (MNTP). However, it is unclear whether these two loss functions are used simultaneously during training, or if the training process is split into different phases where each loss is applied separately. Could you please clarify how the losses are used? If they are used together, what are the relative weights assigned to each?**
40
+ >
41
+ > **Regarding the datasets, CC-3M and Wikitext-103 are mentioned as part of the training process. It seems a bit unclear how these two datasets are combined in the training phase. Given that Wikitext-103 is a pure language corpus while CC-3M is image-caption based, how are they jointly used during the fine-tuning process? Are they used for different stages or tasks?**
42
+ >
43
+ > Looking forward to your insights on this!
44
+ >
45
+ ![A3](https://via.placeholder.com/15/red/000000?text=+) **A:** Thank you for your question. I’m glad to clarify.
46
+
47
+ **Loss Functions Integration:** We use the supervised SimCSE loss to make different captions of the same image positive samples for each other, while captions of different images serve as negative samples. This loss function is key to our method, allowing the LLM to provide meaningful supervisory signals to the image. However, the Masked Next Token Prediction (MNTP) was an initial stage we employed before using the supervised SimCSE loss; it can be understood as an earlier step in training. We first conduct MNTP, followed by supervised SimCSE loss, in a two-stage process. In practice, MNTP has little impact on the results, so removing it does not affect the conclusions. However, for optimal performance, we still chose to use MNTP before applying supervised SimCSE loss.
48
+
49
+ **Dataset Combination:** We indeed mix both pure text and caption datasets. This is because the LLM is initially pre-trained on pure text data, so we aim to retain its original distribution with minimal shift by using the pure text dataset Wikitext-103, which also helps mitigate any bias introduced by captions. Our approach is to mix and shuffle the two datasets and then sample batches normally for training. This is a common and effective practice.
50
+
51
+ If you have more questions, please feel free to ask.
52
+
53
+ ## Q4:
54
+
55
+ > **Q: LLM2CLIP does not bring out significant improvements on ImageNet-1k only or all these zero-shot benchmarks?**
56
+ >
57
+ > **Have you ever measured the average caption length between your method and vanilla EVA-02-CLIP? In my opinion, longer text captions do not always bring out improvements.**
58
+ >
59
+ > **It's reasonable to improve the performances of VLMs on the SQA and Wizwiz benchmarks while it's strange to drop the performances on the fundamental benchmarks such as MME.**
60
+
61
+ ![A4](https://via.placeholder.com/15/purple/000000?text=+) **A:** We haven’t specifically tested it, and the improvement on ImageNet is indeed not very noticeable. With OpenAI’s CLIP, we can achieve about a one-point improvement, which is relatively modest compared to other retrieval tasks. My guess is that we used a large amount of dense captions, which may cause the model to favor more complex text. However, we have found in experiments that ImageNet performance is strongly correlated with data volume, possibly related to the word distribution used during alignment. We only used 15 million data points for the alignment in LLM fine-tuning. In the next version, we’ll increase the training data for LLM2CLIP by tens of times, so we plan to re-evaluate it then.
62
+
63
+ The improvement of long captions or dense captions for CLIP is quite limited. Works like LongCLIP (https://arxiv.org/abs/2403.15378) and DCI (https://arxiv.org/abs/2312.08578) specifically address this issue. The problem here is that the original CLIP text encoder lacks the ability to understand such information or handle captions of this length. However, LLM2CLIP, even when trained on a fully short-text dataset, still demonstrates outstanding and leading performance, as shown in Table 5 of the paper.
64
+
65
+ ## Q5:
66
+
67
+ > **Q: Hello!**
68
+ >
69
+ > **I am very interested in your work, and I encountered some issues during the reproduction process.**
70
+ >
71
+ > **How can I replace the original text encoder with the tuned Llama 3 model? I checked the config file LLM2CLIP-EVA02-L-14-336/configuration_evaclip.py, and I noticed that the model parameters for the text encoder remain the same as those in the original CLIP model. This is a bit confusing to me.**
72
+ >
73
+ > **If I’m correct, is the run.sh script provided for training CLIP with a frozen Llama 3 encoder?**
74
+ >
75
+ > Looking forward for your reply!
76
+ >
77
+ ![A5](https://via.placeholder.com/15/orange/000000?text=+) **A:** We have updated the caption contrastive fine-tuned version of Llama3-8B-CC (https://huggingface.co/microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned) to assist with your retrieval experiments and training of your own CLIP models. Additionally, the parameters for our adapter and projector have been made available in our OpenAI ViT-L repository (https://huggingface.co/microsoft/LLM2CLIP-Openai-L-14-336). The retrieval testing methods are documented in the model card for reference.
78
+
79
+ Our tests show retrieval performance exceeding the results reported in the paper, and we encourage you to try it out.
80
+
81
+ Regarding the EVA series of models, there have been precision mismatches during the conversion to Hugging Face, which are currently being fixed. Updates will be released progressively.
82
+
83
+ Furthermore, we will provide detailed instructions on how to use LLM2CLIP to fine-tune your own CLIP models in about a week—please stay tuned!
84
+
85
+ ## Q6:
86
+
87
+ > **Q: Hello!**
88
+ >
89
+ > **I am very interested in your work, and I encountered some issues during the reproduction process.**
90
+ >
91
+ > **How can I replace the original text encoder with the tuned Llama 3 model? I checked the config file LLM2CLIP-EVA02-L-14-336/configuration_evaclip.py, and I noticed that the model parameters for the text encoder remain the same as those in the original CLIP model. This is a bit confusing to me.**
92
+ >
93
+ > **If I’m correct, is the run.sh script provided for training CLIP with a frozen Llama 3 encoder?**
94
+ >
95
+ > Looking forward for your reply!
96
+ >
97
+ ![A6](https://via.placeholder.com/15/orange/000000?text=+) **A:** We have updated the caption contrastive fine-tuned version of Llama3-8B-CC (https://huggingface.co/microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned) to assist with your retrieval experiments and training of your own CLIP models. Additionally, the parameters for our adapter and projector have been made available in our OpenAI ViT-L repository (https://huggingface.co/microsoft/LLM2CLIP-Openai-L-14-336). The retrieval testing methods are documented in the model card for reference.
98
+
99
+ Our tests show retrieval performance exceeding the results reported in the paper, and we encourage you to try it out.
100
+
101
+ Regarding the EVA series of models, there have been precision mismatches during the conversion to Hugging Face, which are currently being fixed. Updates will be released progressively.
102
+
103
+ Furthermore, we will provide detailed instructions on how to use LLM2CLIP to fine-tune your own CLIP models in about a week—please stay tuned!
104
+ >
105
+ ## Q6:
106
+
107
+ > **Q: I find the LLM2CLIP approach inspiring as it leverages large language models (LLMs) to enhance cross-modal representation learning. The integration of fine-tuned LLMs as a textual encoder offers substantial improvements over traditional CLIP models. However, I have a few questions and suggestions regarding the methodology and evaluation:**
108
+ >
109
+ > **While the paper highlights the efficiency of training using LoRA and freezing LLM gradients, scaling to datasets larger than the 60M configuration or involving multilingual captions could introduce challenges. Could you elaborate on the computational implications if fine-tuning were performed without freezing the LLM gradients?**
110
+ >
111
+ > **The contrastive fine-tuning strategy for improving feature discriminability is innovative. However, as mentioned, dense captions from ShareCaptioner may introduce noise or distribution mismatches. Have you explored the impact of using alternative caption-generation methods or real-world noisy datasets?**
112
+ >
113
+ > **The use of various datasets like DOCCI and ShareGPT4V provides comprehensive evaluations. However, benchmarks focusing on event understanding, video context, or temporal dependencies could further validate the model's capabilities in real-world multimodal tasks.**
114
+ >
115
+ > **Overall, LLM2CLIP presents a significant advancement in multimodal learning, setting a foundation for future enhancements in cross-modal representation tasks.**
116
+
117
+ ![A6](https://via.placeholder.com/15/orange/000000?text=+) **A:** We opened the latter layers of the network based on the GPU memory we could accommodate but did not observe significant performance improvements, so we decided not to continue this way. CLIP training relies heavily on batch size, and opening the LLM would compromise the batch size, which could have a negative impact. Additionally, keeping the LLM fixed is actually quite reasonable since our goal is to align the visual model with the correct textual modality. Now that we have access to more abundant computational resources, we plan to conduct more experiments in this area to provide answers for the community.
118
+
119
+ We have tried the Recaption-1B dataset (https://github.com/UCSC-VLAA/Recap-DataComp-1B) labeled using Llava 1.5, but its performance was not as good as ShareCaptioner 4V. Real-world noisy datasets essentially align with the conclusion in Table 5 of our paper, specifically the 0% short caption results, which show that they underperform compared to using VLLMs for recaptioning. In our next version, we plan to incorporate a large volume of GPT-4o recaptioned results—please stay tuned!
120
+
121
+ Thank you for your excellent suggestions. Do you have any specific benchmarks you would recommend? We’d be happy to test them.
122
+
123
+ We truly appreciate your recognition and look forward to contributing more valuable models and knowledge to the community in the future.
124
+
125
+ ## Q7:
126
+
127
+ > **Q: This is a really interesting paper that presents a compelling approach to improving visual representation learning by effectively integrating the power of LLMs with CLIP. The entire paper feels well motivated, thoroughly researched, and clearly presented - a truly excellent contribution to the field!**
128
+ >
129
+ > **I am a bit curious that given the importance of CLIP in guiding the image generation process of diffusion models, and the enhancement of CLIP's image-text understanding capabilities by LLM2CLIP demonstrated in the paper, can integrating LLM2CLIP into the training and inference of a diffusion model bring a boost in the text-to-image domain? For example, FLUX and Stable Diffusion 3 series show significant improvement in following natural language prompts than previous diffusion models, and I think LLM2CLIP will bring further improvements.**
130
+ >
131
+ > **Thank you for your innovative work and significant contribution to the field of multimodal learning!**
132
+
133
+ ![A7](https://via.placeholder.com/15/teal/000000?text=+) **A:** Yes, we have also considered that incorporating LLM2CLIP into image-text generative models could enable more complex and precise control, and we believe there is great potential in this direction. In fact, we’ve already conducted some initial experiments, which indicate that LLM2CLIP’s llama3 performs significantly better than a standard llama3 when simply integrated with Stable Diffusion 3. However, we haven’t had the chance to explore this further in depth yet. We might delve into this more thoroughly in the future. Thank you for recognizing our work!
LLM2CLIP/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
LLM2CLIP/SUPPORT.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TODO: The maintainer of this repo has not yet edited this file
2
+
3
+ **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
4
+
5
+ - **No CSS support:** Fill out this template with information about how to file issues and get help.
6
+ - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
7
+ - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
8
+
9
+ *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10
+
11
+ # Support
12
+
13
+ ## How to file issues and get help
14
+
15
+ This project uses GitHub Issues to track bugs and feature requests. Please search the existing
16
+ issues before filing new issues to avoid duplicates. For new issues, file your bug or
17
+ feature request as a new Issue.
18
+
19
+ For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
20
+ FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21
+ CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22
+
23
+ ## Microsoft Support Policy
24
+
25
+ Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
LLaVA/.dockerignore ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # The .dockerignore file excludes files from the container build process.
2
+ #
3
+ # https://docs.docker.com/engine/reference/builder/#dockerignore-file
4
+
5
+ # Exclude Git files
6
+ .git
7
+ .github
8
+ .gitignore
9
+
10
+ # Exclude Python cache files
11
+ __pycache__
12
+ .mypy_cache
13
+ .pytest_cache
14
+ .ruff_cache
15
+
16
+ # Exclude Python virtual environment
17
+ /venv
18
+
19
+ # Exclude some weights
20
+ /openai
21
+ /liuhaotian
LLaVA/.editorconfig ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ root = true
2
+
3
+ # Unix-style newlines with a newline ending every file
4
+ [*]
5
+ end_of_line = lf
6
+ insert_final_newline = true
7
+ trim_trailing_whitespace = true
8
+ charset = utf-8
9
+
10
+ # 4 space indentation
11
+ [*.{py,json}]
12
+ indent_style = space
13
+ indent_size = 4
14
+
15
+ # 2 space indentation
16
+ [*.{md,sh,yaml,yml}]
17
+ indent_style = space
18
+ indent_size = 2
LLaVA/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
LLaVA/cog.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration for Cog ⚙️
2
+ # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
3
+
4
+ build:
5
+ gpu: true
6
+
7
+ python_version: "3.11"
8
+
9
+ python_packages:
10
+ - "torch==2.0.1"
11
+ - "accelerate==0.21.0"
12
+ - "bitsandbytes==0.41.0"
13
+ - "deepspeed==0.9.5"
14
+ - "einops-exts==0.0.4"
15
+ - "einops==0.6.1"
16
+ - "gradio==3.35.2"
17
+ - "gradio_client==0.2.9"
18
+ - "httpx==0.24.0"
19
+ - "markdown2==2.4.10"
20
+ - "numpy==1.26.0"
21
+ - "peft==0.4.0"
22
+ - "scikit-learn==1.2.2"
23
+ - "sentencepiece==0.1.99"
24
+ - "shortuuid==1.0.11"
25
+ - "timm==0.6.13"
26
+ - "tokenizers==0.13.3"
27
+ - "torch==2.0.1"
28
+ - "torchvision==0.15.2"
29
+ - "transformers==4.31.0"
30
+ - "wandb==0.15.12"
31
+ - "wavedrom==2.0.3.post3"
32
+ - "Pygments==2.16.1"
33
+ run:
34
+ - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget
35
+
36
+ # predict.py defines how predictions are run on your model
37
+ predict: "predict.py:Predictor"
LLaVA/pyproject.toml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "llava"
7
+ version = "1.2.2.post1"
8
+ description = "Towards GPT-4 like large language and visual assistant."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "License :: OSI Approved :: Apache Software License",
14
+ ]
15
+ dependencies = [
16
+ "torch==2.1.2", "torchvision==0.16.2",
17
+ "transformers==4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid",
18
+ "accelerate==0.21.0", "peft", "bitsandbytes",
19
+ "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2",
20
+ "gradio==4.16.0", "gradio_client==0.8.1",
21
+ "requests", "httpx==0.24.0", "uvicorn", "fastapi",
22
+ "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13",
23
+ ]
24
+
25
+ [project.optional-dependencies]
26
+ train = ["deepspeed==0.12.6", "ninja", "wandb"]
27
+ build = ["build", "twine"]
28
+
29
+ [project.urls]
30
+ "Homepage" = "https://llava-vl.github.io"
31
+ "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues"
32
+
33
+ [tool.setuptools.packages.find]
34
+ exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
35
+
36
+ [tool.wheel]
37
+ exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
OpenSeeD/README.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OpenSeeD
2
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/a-simple-framework-for-open-vocabulary/panoptic-segmentation-on-coco-minival)](https://paperswithcode.com/sota/panoptic-segmentation-on-coco-minival?p=a-simple-framework-for-open-vocabulary)
3
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/a-simple-framework-for-open-vocabulary/panoptic-segmentation-on-ade20k-val)](https://paperswithcode.com/sota/panoptic-segmentation-on-ade20k-val?p=a-simple-framework-for-open-vocabulary)
4
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/a-simple-framework-for-open-vocabulary/instance-segmentation-on-ade20k-val)](https://paperswithcode.com/sota/instance-segmentation-on-ade20k-val?p=a-simple-framework-for-open-vocabulary)
5
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/a-simple-framework-for-open-vocabulary/instance-segmentation-on-cityscapes-val)](https://paperswithcode.com/sota/instance-segmentation-on-cityscapes-val?p=a-simple-framework-for-open-vocabulary)
6
+
7
+ This is the official implementation of the paper "[A Simple Framework for Open-Vocabulary Segmentation and Detection](https://arxiv.org/pdf/2303.08131.pdf)".
8
+
9
+ https://user-images.githubusercontent.com/34880758/225408795-d1e714e0-cfc8-4466-b052-045d54409a1d.mp4
10
+
11
+ You can also find the more detailed demo at [video link on Youtube](https://www.youtube.com/watch?v=z4gsQw2n7iM).
12
+
13
+ :point_right: **[New] demo code is available**
14
+ :point_right: **[New] OpenSeeD has been accepted to ICCV 2023! training code is available!**
15
+
16
+ ### :rocket: Key Features
17
+ - A Simple Framework for Open-Vocabulary Segmentation and Detection.
18
+ - Support interactive segmentation with box input to generate mask.
19
+
20
+ ### :bulb: Installation
21
+ ```sh
22
+ pip3 install torch==1.13.1 torchvision==0.14.1 --extra-index-url https://download.pytorch.org/whl/cu113
23
+ python -m pip install 'git+https://github.com/MaureenZOU/detectron2-xyz.git'
24
+ pip install git+https://github.com/cocodataset/panopticapi.git
25
+ python -m pip install -r requirements.txt
26
+ export DATASET=/pth/to/dataset
27
+ ```
28
+ Download the pretrained checkpoint from [here](https://github.com/IDEA-Research/OpenSeeD/releases/download/openseed/model_state_dict_swint_51.2ap.pt).
29
+ ### :bulb: Demo script
30
+ ```sh
31
+ python demo/demo_panoseg.py evaluate --conf_files configs/openseed/openseed_swint_lang.yaml --image_path images/animals.png --overrides WEIGHT /path/to/ckpt/model_state_dict_swint_51.2ap.pt
32
+ ```
33
+ :fire: Remember to **modify the vocabulary** `thing_classes` and `stuff_classes` in `demo_panoseg.py` if your want to segment open-vocabulary objects.
34
+
35
+ **Evaluation on coco**
36
+ ```sh
37
+ python train_net.py --original_load --eval_only --num-gpus 8 --config-file configs/openseed/openseed_swint_lang.yaml MODEL.WEIGHTS=[/path/to/lang/weight](https://github.com/IDEA-Research/OpenSeeD/releases/download/openseed/model_state_dict_swint_51.2ap.pt)
38
+ ```
39
+ You are expected to get `55.4` PQ.
40
+ ### :bulb: Some coco-format data
41
+ Here is the coco-format json file for evaluating [BDD](https://github.com/IDEA-Research/OpenSeeD/releases/download/bdd_val_data/coco_val.json) and [SUN](https://github.com/IDEA-Research/OpenSeeD/releases/tag/sun_data).
42
+ ### Training OpenSeeD baseline
43
+ **Training on coco**
44
+ ```sh
45
+ python train_net.py --num-gpus 8 --config-file configs/openseed/openseed_swint_lang.yaml --lang_weight [/path/to/lang/weight](https://github.com/IDEA-Research/OpenSeeD/releases/download/training/model_state_dict_only_language.pt)
46
+ ```
47
+ **Training on coco+o365**
48
+ ```sh
49
+ python train_net.py --num-gpus 8 --config-file configs/openseed/openseed_swint_lang_o365.yaml --lang_weight [/path/to/lang/weight](https://github.com/IDEA-Research/OpenSeeD/releases/download/training/model_state_dict_only_language.pt)
50
+ ```
51
+ ### Checkpoints
52
+ - Swin-T model trained on COCO panoptic segmentation and Objects365 [weights](https://github.com/IDEA-Research/OpenSeeD/releases/tag/ckpt_swint_coco_o365).
53
+ - Swin-L model fine-tuned on COCO panoptic segmentation [weights](https://github.com/IDEA-Research/OpenSeeD/releases/tag/coco_pano_sota_swinl).
54
+ - Swin-L model fine-tuned on ADE20K semantic segmentation [weights](https://github.com/IDEA-Research/OpenSeeD/releases/tag/ade20k_swinl).
55
+ ![hero_figure](figs/intro.jpg)
56
+ ### :unicorn: Model Framework
57
+ ![hero_figure](figs/framework.jpg)
58
+ ### :volcano: Results
59
+ Results on open segmentation
60
+ ![hero_figure](figs/results1.jpg)
61
+ Results on task transfer and segmentation in the wild
62
+ ![hero_figure](figs/results2.jpg)
63
+
64
+
65
+ ### <a name="CitingOpenSeeD"></a>Citing OpenSeeD
66
+
67
+ If you find our work helpful for your research, please consider citing the following BibTeX entry.
68
+
69
+ ```BibTeX
70
+ @article{zhang2023simple,
71
+ title={A Simple Framework for Open-Vocabulary Segmentation and Detection},
72
+ author={Zhang, Hao and Li, Feng and Zou, Xueyan and Liu, Shilong and Li, Chunyuan and Gao, Jianfeng and Yang, Jianwei and Zhang, Lei},
73
+ journal={arXiv preprint arXiv:2303.08131},
74
+ year={2023}
75
+ }
76
+ ```
77
+
OpenSeeD/__init__.py ADDED
File without changes
OpenSeeD/requirements.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ opencv-python
4
+ pyyaml
5
+ json_tricks
6
+ yacs
7
+ scikit-learn
8
+ pandas
9
+ timm==0.4.12
10
+ numpy==1.23.5
11
+ einops
12
+ fvcore
13
+ transformers==4.19.2
14
+ sentencepiece
15
+ ftfy
16
+ regex
17
+ nltk
18
+ vision-datasets==0.2.2
19
+ pycocotools==2.0.4
20
+ diffdist
21
+ pyarrow
22
+ cityscapesscripts
23
+ shapely
24
+ scikit-image
25
+ mup
26
+ gradio==3.13.0
27
+ scann
28
+ kornia==0.6.4
29
+ torchmetrics==0.6.0
30
+ mpi4py
Ovis/README.md ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ovis: Structural Embedding Alignment for Multimodal Large Language Model
2
+
3
+ Ovis (Open VISion) is a novel Multimodal Large Language Model (MLLM) architecture, designed to structurally align visual and textual embeddings. For a comprehensive introduction, please refer to the [Ovis paper](https://arxiv.org/abs/2405.20797).
4
+
5
+ <div style="text-align: center;">
6
+ <img style="max-width: 100%;" src="docs/ovis-illustration.png" alt="Ovis Illustration"/>
7
+ </div>
8
+
9
+ ## Release
10
+ - [11/26] 🔥 Announcing [Ovis1.6-Gemma2-27B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-27B)!
11
+ - [11/04] 🔥 Announcing quantized versions of Ovis1.6: [Ovis1.6-Gemma2-9B-GPTQ-Int4](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4) and [Ovis1.6-Llama3.2-3B-GPTQ-Int4](https://huggingface.co/AIDC-AI/Ovis1.6-Llama3.2-3B-GPTQ-Int4)!
12
+ - [10/22] 🔥 Announcing Ovis1.6-Llama3.2-3B ([Model](https://huggingface.co/AIDC-AI/Ovis1.6-Llama3.2-3B), [Demo](https://huggingface.co/spaces/AIDC-AI/Ovis1.6-Llama3.2-3B))!
13
+ - [09/19] 🔥 Announcing Ovis1.6-Gemma2-9B ([Model](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B), [Demo](https://huggingface.co/spaces/AIDC-AI/Ovis1.6-Gemma2-9B))! This latest release further enhances high-resolution image processing, is trained on a larger, more diverse, and higher-quality dataset, and refines the training process with DPO training following instruction-tuning.
14
+ - [07/24] 🔥 Introducing Ovis1.5, featuring improved high-resolution image processing and optimized training data for enhanced performance.
15
+ - [06/14] 🔥 Launch of Ovis1.0, the inaugural version of the Ovis model.
16
+
17
+ ## Contents
18
+ - [Install](#install)
19
+ - [Model](#model)
20
+ - [Performance](#performance)
21
+ - [Finetune](#finetune)
22
+ - [Inference](#inference)
23
+ - [Quantization](#quantization)
24
+ - [Citation](#citation)
25
+ - [Team](#team)
26
+ - [License](#license)
27
+
28
+ ## Install
29
+ Ovis has been tested with Python 3.10, Torch 2.4.0, Transformers 4.46.2, and DeepSpeed 0.15.4. For a comprehensive list of package dependencies, please consult the `requirements.txt` file. Before finetuning or inference, please install Ovis as follows.
30
+ ```bash
31
+ git clone git@github.com:AIDC-AI/Ovis.git
32
+ conda create -n ovis python=3.10 -y
33
+ conda activate ovis
34
+ cd Ovis
35
+ pip install -r requirements.txt
36
+ pip install -e .
37
+ ```
38
+
39
+ ## Model
40
+ Ovis can be instantiated with popular LLMs. We provide the following Ovis MLLMs:
41
+
42
+ | Ovis MLLMs | ViT | LLM | Model Weights | Demo |
43
+ |:------------------|:-----------:|:------------------:|:---------------------------------------------------------------:|:----------------------------------------------------------------:|
44
+ | Ovis1.6-Gemma2-27B | Siglip-400M | Gemma2-27B-It | [Huggingface](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-27B) | - |
45
+ | Ovis1.6-Gemma2-9B | Siglip-400M | Gemma2-9B-It | [Huggingface](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B) | [Space](https://huggingface.co/spaces/AIDC-AI/Ovis1.6-Gemma2-9B) |
46
+ | Ovis1.6-Llama3.2-3B | Siglip-400M | Llama-3.2-3B-Instruct | [Huggingface](https://huggingface.co/AIDC-AI/Ovis1.6-Llama3.2-3B) | [Space](https://huggingface.co/spaces/AIDC-AI/Ovis1.6-Llama3.2-3B) |
47
+
48
+ ## Performance
49
+ With **29B** parameters, **Ovis1.6-Gemma2-27B** achieves exceptional performance in the [OpenCompass](https://github.com/open-compass/VLMEvalKit) benchmark, ranking among the top-tier open-source MLLMs.
50
+
51
+ ![performance-Ovis1_6-Gemma2-27B](docs/performance/Ovis1_6-Gemma2-27B.png)
52
+
53
+ With just **10B** parameters, **Ovis1.6-Gemma2-9B** leads the [OpenCompass](https://github.com/open-compass/VLMEvalKit) benchmark among open-source MLLMs within **30B** parameters.
54
+
55
+ ![performance-Ovis1_6-Gemma2-9B](docs/performance/Ovis1_6-Gemma2-9B.png)
56
+
57
+ **Ovis1.6-Llama3.2-3B** leads the [OpenCompass](https://github.com/open-compass/VLMEvalKit) benchmark among open-source MLLMs under **4B** parameters, even surpassing Llama-3.2-11B-Vision-Instruct.
58
+
59
+ ![performance-Ovis1_6-Llama3_2-3B](docs/performance/Ovis1_6-Llama3_2-3B.png)
60
+
61
+ ## Finetune
62
+ Finetuning Ovis1.6-Gemma2-9B is supported in [ms-swift](https://github.com/modelscope/ms-swift).
63
+
64
+ ## Inference
65
+ We provide an inference wrapper in `ovis/serve/runner.py`, which can be used as:
66
+ ```python
67
+ from PIL import Image
68
+ from ovis.serve.runner import RunnerArguments, OvisRunner
69
+ image = Image.open('temp.png')
70
+ text = 'PROMPT'
71
+ runner_args = RunnerArguments(model_path='AIDC-AI/Ovis1.6-Gemma2-27B')
72
+ runner = OvisRunner(runner_args)
73
+ generation = runner.run([image, text])
74
+ ```
75
+ Based on [Gradio](https://github.com/gradio-app/gradio), Ovis can also be accessed via a web user interface:
76
+ ```bash
77
+ python ovis/serve/server.py --model_path MODEL_PATH --port PORT
78
+ ```
79
+
80
+ ## Quantization
81
+ We quantized Ovis1.6 using AutoGPTQ. For detailed information on running and creating your own quantized version, please refer to the respective Huggingface model cards: [Ovis1.6-Gemma2-9B-GPTQ-Int4](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4) and [Ovis1.6-Llama3.2-3B-GPTQ-Int4](https://huggingface.co/AIDC-AI/Ovis1.6-Llama3.2-3B-GPTQ-Int4). Quantized Ovis1.6 maintains performance comparable to its non-quantized counterpart while requiring less GPU memory:
82
+
83
+ - Benchmark performance:
84
+ ![performance-Ovis1_6-Gemma2-9B-GPTQ-Int4](docs/performance/Ovis1_6-Gemma2-9B-GPTQ-Int4.png)
85
+ ![performance-Ovis1_6-Llama3_2-3B-GPTQ-Int4](docs/performance/Ovis1_6-Llama3_2-3B-GPTQ-Int4.png)
86
+
87
+ - GPU memory usage (max_partition=9):
88
+ ![performance-Ovis1_6-VRAM-Comparison](docs/performance/Ovis1_6-VRAM-Comparison.png)
89
+
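+ As a minimal sketch (not an official recipe), a quantized checkpoint can be loaded through the same `OvisRunner` interface shown in the Inference section; whether the runner wraps the GPTQ checkpoints without additional setup is an assumption here, and the linked model cards remain the authoritative reference:
+ ```python
+ from PIL import Image
+ from ovis.serve.runner import RunnerArguments, OvisRunner
+
+ # Sketch only: point the runner at a GPTQ-quantized checkpoint instead of the
+ # full-precision one; AutoGPTQ-related requirements follow the model card.
+ runner_args = RunnerArguments(model_path='AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4')
+ runner = OvisRunner(runner_args)
+ generation = runner.run([Image.open('temp.png'), 'PROMPT'])
+ ```
+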
90
+ ## Citation
91
+ If you find Ovis useful, please cite the paper:
92
+ ```
93
+ @article{lu2024ovis,
94
+ title={Ovis: Structural Embedding Alignment for Multimodal Large Language Model},
95
+ author={Shiyin Lu and Yang Li and Qing-Guo Chen and Zhao Xu and Weihua Luo and Kaifu Zhang and Han-Jia Ye},
96
+ year={2024},
97
+ journal={arXiv:2405.20797}
98
+ }
99
+ ```
100
+
101
+ ## Team
102
+ This work is a collaborative effort by the MarcoVL team. We would also like to provide links to the following MLLM papers from our team:
103
+ - [Parrot: Multilingual Visual Instruction Tuning](https://arxiv.org/abs/2406.02539)
104
+ - [Wings: Learning Multimodal LLMs without Text-only Forgetting](https://arxiv.org/abs/2406.03496)
105
+
106
+ ## License
107
+ This project is licensed under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt) (SPDX-License-Identifier: Apache-2.0).
108
+
109
+ ## Disclaimer
110
+ We used compliance-checking algorithms during the training process, to ensure the compliance of the trained model to the best of our ability. Due to the complexity of the data and the diversity of language model usage scenarios, we cannot guarantee that the model is completely free of copyright issues or improper content. If you believe anything infringes on your rights or generates improper content, please contact us, and we will promptly address the matter.
PaddleMIX/.copyright.hook ADDED
@@ -0,0 +1,134 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import absolute_import
16
+ from __future__ import print_function
17
+ from __future__ import unicode_literals
18
+
19
+ import argparse
20
+ import io
21
+ import re
22
+ import sys
23
+ import os
24
+ import datetime
25
+
26
+ COPYRIGHT = '''Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
27
+
28
+ Licensed under the Apache License, Version 2.0 (the "License");
29
+ you may not use this file except in compliance with the License.
30
+ You may obtain a copy of the License at
31
+
32
+ http://www.apache.org/licenses/LICENSE-2.0
33
+
34
+ Unless required by applicable law or agreed to in writing, software
35
+ distributed under the License is distributed on an "AS IS" BASIS,
36
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
37
+ See the License for the specific language governing permissions and
38
+ limitations under the License.'''
39
+
40
+ def _generate_copyright(comment_mark):
41
+ copyright=COPYRIGHT.split(os.linesep)
42
+ header = copyright[0].rstrip()
43
+
44
+ p = re.search(r'(\d{4})', header).group(0)
45
+ now = datetime.datetime.now()
46
+
47
+ header = header.replace(p,str(now.year))
48
+
49
+ ans=[comment_mark + " " + header + os.linesep]
50
+ for idx, line in enumerate(copyright[1:]):
51
+ ans.append(comment_mark + " " + line.rstrip() + os.linesep)
52
+
53
+ return ans
54
+
55
+ def _get_comment_mark(path):
56
+ lang_type=re.compile(r"\.(py|sh)$")
57
+ if lang_type.search(path) is not None:
58
+ return "#"
59
+
60
+ lang_type=re.compile(r"\.(h|c|hpp|cc|cpp|cu|go|cuh|proto)$")
61
+ if lang_type.search(path) is not None:
62
+ return "//"
63
+
64
+ return None
65
+
66
+
67
+ RE_ENCODE = re.compile(r"^[ \t\v]*#.*?coding[:=]", re.IGNORECASE)
68
+ RE_COPYRIGHT = re.compile(r".*Copyright( \(c\))* \d{4}", re.IGNORECASE)
69
+ RE_SHEBANG = re.compile(r"^[ \t\v]*#[ \t]?\!")
70
+
71
+ def _check_copyright(path):
72
+ head=[]
73
+ try:
74
+ with open(path) as f:
75
+ head = [next(f) for x in range(4)]
76
+ except StopIteration:
77
+ pass
78
+
79
+ for idx, line in enumerate(head):
80
+ if RE_COPYRIGHT.search(line) is not None:
81
+ return True
82
+
83
+ return False
84
+
85
+ def generate_copyright(path, comment_mark):
86
+ original_contents = io.open(path, encoding="utf-8").readlines()
87
+ head = original_contents[0:4]
88
+
89
+ insert_line_no=0
90
+ for i, line in enumerate(head):
91
+ if RE_ENCODE.search(line) or RE_SHEBANG.search(line):
92
+ insert_line_no=i+1
93
+
94
+ copyright = _generate_copyright(comment_mark)
95
+ if insert_line_no == 0:
96
+ new_contents = copyright
97
+ if len(original_contents) > 0 and len(original_contents[0].strip()) != 0:
98
+ new_contents.append(os.linesep)
99
+ new_contents.extend(original_contents)
100
+ else:
101
+ new_contents=original_contents[0:insert_line_no]
102
+ new_contents.append(os.linesep)
103
+ new_contents.extend(copyright)
104
+ if len(original_contents) > insert_line_no and len(original_contents[insert_line_no].strip()) != 0:
105
+ new_contents.append(os.linesep)
106
+ new_contents.extend(original_contents[insert_line_no:])
107
+ new_contents="".join(new_contents)
108
+
109
+ with io.open(path, 'w') as output_file:
110
+ output_file.write(new_contents)
111
+
112
+
113
+
114
+ def main(argv=None):
115
+ parser = argparse.ArgumentParser(
116
+ description='Checker for copyright declaration.')
117
+ parser.add_argument('filenames', nargs='*', help='Filenames to check')
118
+ args = parser.parse_args(argv)
119
+
120
+ retv = 0
121
+ for path in args.filenames:
122
+ comment_mark = _get_comment_mark(path)
123
+ if comment_mark is None:
124
+ print("warning:Unsupported file", path, file=sys.stderr)
125
+ continue
126
+
127
+ if _check_copyright(path):
128
+ continue
129
+
130
+ generate_copyright(path, comment_mark)
131
+
132
+
133
+ if __name__ == '__main__':
134
+ exit(main())
PaddleMIX/.style.yapf ADDED
@@ -0,0 +1,3 @@
1
+ [style]
2
+ based_on_style = pep8
3
+ column_limit = 80
PaddleMIX/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
PaddleMIX/README_EN.md ADDED
@@ -0,0 +1,390 @@
1
+ [简体中文](README.md) | English
2
+
3
+ <p align="center">
4
+ <img src="https://github.com/PaddlePaddle/PaddleMIX/assets/22989727/2cd19298-1c52-4d73-a0f7-dcdab6a8ec90" align="middle" width = "600" />
5
+ </p>
6
+
7
+ <p align="center">
8
+ <a href="https://github.com/PaddlePaddle/PaddleMix/releases"><img src="https://img.shields.io/github/v/release/PaddlePaddle/PaddleMix?color=ffa"></a>
9
+ <a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
10
+ <a href=""><img src="https://img.shields.io/badge/python-3.7+-aff.svg"></a>
11
+ <a href=""><img src="https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg"></a>
12
+ <a href="#📌社区交流"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
13
+ <a href="https://github.com/PaddlePaddle/PaddleMIX/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleMIX?color=ccf"></a>
14
+
15
+ </p>
16
+ </div>
17
+
18
+ ## 💌 Table of Contents
19
+ - [💌 Table of Contents](#table-of-contents)
20
+ - [📰 News](#news)
21
+ - [📣 Latest Developments](#latest-developments)
22
+ - [🌈 Introduction](#introduction)
23
+ - [✨ Key Features](#key-features)
24
+ - [📱 Rich Multimodal Capabilities](#rich-multimodal-capabilities)
25
+ - [🧩 Simple Development Experience](#simple-development-experience)
26
+ - [💡 High-Performance Distributed Training and Inference Capabilities](#high-performance-distributed-training-and-inference-capabilities)
27
+ - [🔧 Unique Features and Tools](#unique-features-and-tools)
28
+ - [🔍 Installation](#installation)
29
+ - [🔥 Tutorials](#tutorials)
30
+ - [🤔 FAQ](#faq)
31
+ - [📱 Model Library](#model-library)
32
+ - [📝 License](#license)
33
+ - [📌 Community](#community)
34
+
35
+
36
+ ## 📰 News
37
+
38
+ **🔥 PaddleMIX Development Project Challenge, 2024.11.21 - 2024.12.22 (Ended)**
41
+
42
+ - ✨「Experience Officer Recruitment」PaddleMIX Development Project Challenge
43
+ Click the link to register 🔗: [https://aistudio.baidu.com/activitydetail/1503019366](https://aistudio.baidu.com/activitydetail/1503019366)
44
+ 🏆 Submit your project to the PaddlePaddle Galaxy Community Project Hall to be featured and receive a PaddleMIX Experience Officer certificate and JD.com card incentives.
45
+ Everyone is welcome to submit~
46
+
47
+ <details>
48
+ <summary>Click to view the event poster</summary>
49
+ <p align="center">
50
+ <img src='https://github.com/user-attachments/assets/27e0bbe3-0ff8-49ef-bd39-81a31a2b288b' width="25%">
51
+ </p>
52
+ </details>
53
+
54
+ ## 📣 Latest Developments
55
+
56
+ **🎉 2024.12.17 Support for [InternVL2_5 (1B, 2B, 4B, 8B)](./paddlemix/examples/internvl2) inference**
57
+
58
+ **🎉 2024.11.27 Added support for [Janus/JanusFlow](./paddlemix/examples/janus) inference**
59
+
60
+ **🎉 2024.11.21 Added support for [MiniCPM-V-2_6](./paddlemix/examples/minicpm-v-2_6) inference**
61
+
62
+ **🎉 2024.11.8 Support for [DenseConnector](./paddlemix/examples/llava_denseconnector) and [Aquila-VL-2B-llava-qwen](./paddlemix/examples/llava_onevision/) inference**
63
+
64
+ **🎉 2024.11.1 Support for [LLaVA-OneVision](./paddlemix/examples/llava_onevision/) and [LLaVA-Critic](./paddlemix/examples/llava_critic/) inference**
65
+
66
+ **🎉 2024.10.31 The community [Tutorial Page](paddlemix_applications.md), collecting external developers' creative projects, has been updated**
67
+ * 🌟 Since the launch of our Large Model Suite Premium Project Collection activity on September 6th, we have received 30 high-quality developer projects. Among them, 25 premium projects have successfully passed the platform evaluation and been featured.
68
+
69
+ * 🙏 We sincerely thank all developers for their wonderful creations based on our suite! 🚀 We cordially invite you to share your creativity as well - welcome to publish your tutorials on public web pages or in the [PaddlePaddle AI Studio](https://aistudio.baidu.com/aistudio/community/multimodal?from=singlemessage) community!
70
+
71
+ <details>
72
+ <summary>Click to expand more</summary>
73
+
74
+ **🔥 PaddleMIX v2.1 Released on 2024.10.11**
75
+ * Supports the [PaddleNLP 3.0 beta](https://github.com/PaddlePaddle/PaddleNLP/releases/tag/v3.0.0-beta0) version, allowing early access to its latest features.
76
+ * Added cutting-edge models like [Qwen2-VL](./paddlemix/examples/qwen2_vl/), [InternVL2](./paddlemix/examples/internvl2/), and [Stable Diffusion 3 (SD3)](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/dreambooth/README_sd3.md).
77
+ * Released our self-developed multimodal data capability tagging model [PP-InsCapTagger](./paddlemix/datacopilot/example/pp_inscaptagger/), which can be used for data analysis and filtering. Experimental cases show that it can reduce data volume by 50% while maintaining model performance, significantly improving training efficiency.
78
+
79
+ * The multimodal large models InternVL2, LLaVA, SD3, and SDXL are now adapted to the Ascend 910B, offering training and inference capabilities on domestic computing chips.
80
+
81
+
82
+ **PaddleMIX v2.0 Released on 2024.07.25**
83
+ * Multimodal Understanding: Added LLaVA series, Qwen-VL, etc.; introduced Auto module to unify the SFT training process; introduced Mixtoken training strategy, increasing SFT throughput by 5.6 times.
84
+ * Multimodal Generation: Released [PPDiffusers 0.24.1](./ppdiffusers/README.md), supporting video generation capabilities, and added LCM to the text-to-image model. Also added a PaddlePaddle version of PEFT and the Accelerate backend. Provided a ComfyUI plugin developed with PaddlePaddle.
85
+ * Multimodal Data Processing Toolbox [DataCopilot](./paddlemix/datacopilot/): Supports custom data structures, data transformation, and offline format checks. Includes basic statistical information and data visualization functionality.
86
+
87
+ **PaddleMIX v1.0 Released on 2023.10.7**
88
+ * Added distributed training capabilities for vision-language pre-training models, and BLIP-2 now supports trillion-scale training.
89
+ * Introduced the cross-modal application pipeline [AppFlow](./applications/README.md), which supports 11 cross-modal applications such as automatic annotation, image editing, and audio-to-image with one click.
90
+ * [PPDiffusers](./ppdiffusers/README.md) released version 0.19.3, adding SDXL and related tasks.
91
+ </details>
92
+
93
+
94
+ ---
95
+
96
+ ## 🌈 Introduction
97
+
98
+ PaddleMIX is a multimodal large model development suite based on PaddlePaddle, integrating various modalities such as images, text, and video. It covers a wide range of multimodal tasks, including vision-language pre-training, fine-tuning, text-to-image, text-to-video, and multimodal understanding. It offers an out-of-the-box development experience while supporting flexible customization to meet diverse needs, empowering the exploration of general artificial intelligence.
99
+
100
+ <p align="center">
101
+ <img src="https://github.com/user-attachments/assets/764b32a4-3933-4ef8-a0b2-dd425af49ef8" align="middle" width = 100% />
102
+ </p>
103
+
104
+ The PaddleMIX toolchain includes data processing, model development, pre-training, fine-tuning, and inference deployment, supporting mainstream multimodal models such as EVA-CLIP, BLIP-2, and Stable Diffusion. With cross-modal task pipelines like AppFlow and text-to-image application pipelines, developers can quickly build multimodal applications.
105
+
106
+ ### An example of multimodal understanding is shown below:
107
+
108
+ <img src="https://github.com/user-attachments/assets/4c9a0427-57c7-4e1b-80f0-428c03119cc3"></img>
109
+
110
+
111
+ Multimodal understanding 🤝 integrates visual 👀 and linguistic 💬 processing capabilities. It includes functions such as basic perception, fine-grained image understanding, and complex visual reasoning 🧠. Our [Model Library](#model-library) offers practical applications for single-image, multi-image, and video inference. Features include natural image summarization 📝, question answering 🤔, OCR 🔍, sentiment recognition ❤️😢, specialized image analysis 🔬, and code interpretation 💻. These technologies can be applied in various fields such as education 📚, healthcare 🏥, industry 🏭, and more, enabling comprehensive intelligent analysis from static images 🖼️ to dynamic videos 🎥. We invite you to experience and explore these capabilities!
112
+
113
+ ### An example of multimodal generation is shown below:
114
+
115
+ <div style="display: flex; justify-content: center; gap: 5px;">
116
+ <img src="https://github.com/user-attachments/assets/f4768f08-f7a3-45e0-802c-c91554dc5dfc" style="height: 250px; object-fit: fill;">
117
+ <img src="https://github.com/user-attachments/assets/9bf4a333-af57-4ddd-a514-617dea8da435" style="height: 250px; object-fit: fill;">
118
+ </div>
119
+
120
+ Multimodal generation ✍️ combines the creative power of text 💬 and visuals 👀. It includes various technologies ranging from text-to-image 🖼️ to text-to-video 🎥, featuring advanced models like Stable Diffusion 3 and Open-Sora. We provide practical applications for single-image generation, multi-image synthesis, and video generation in [ppdiffusers](ppdiffusers/README.md). These features cover areas such as artistic creation 🎨, animation production 📽️, and content generation 📝. With these technologies, creative generation from static images to dynamic videos can be applied in fields like education 📚, entertainment 🎮, advertising 📺, and more. We invite you to experience and explore these innovations!
121
+
122
+ ### Example of featured applications (click the titles for a quick jump to the online experience):
123
+ | [**ComfyUI Creative Workflow**](https://aistudio.baidu.com/community/app/106043) | [**Art Style QR Code Model**](https://aistudio.baidu.com/community/app/1339) | [**Mix Image Overlay**](https://aistudio.baidu.com/community/app/1340) |
124
+ | :--------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------: |
125
+ | <img src='https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/36ba7261-1744-41a4-b1cb-c9e99f6931f2' width="300px"> | <img src='https://github.com/PaddlePaddle/Paddle/assets/22989727/ba091291-a1ee-49dc-a1af-fc501c62bfc8' width="300px"> | <img src='https://github.com/PaddlePaddle/Paddle/assets/22989727/a71be5a0-b0f3-4aa8-bc20-740ea8ae6785' width="300px"> |
126
+ | [**Anime Text-to-Image**](https://aistudio.baidu.com/community/app/2/webUI?source=appCenter) | [**AI Art|50+ Lora Style Overlays**](https://aistudio.baidu.com/community/app/2848/webUI?source=appCenter) | [**ControlNet|Partial Image Repainting**](https://aistudio.baidu.com/community/app/1981/webUI?source=appCenter) |
127
+ | <img src='https://github.com/user-attachments/assets/a4af8f8a-08c7-4da7-8575-9dbfedaba56c' width="200px"> | <img src='https://github.com/user-attachments/assets/fa92c229-a885-46a1-b23f-a076855c93ec' width="200px"> | <img src='https://github.com/user-attachments/assets/78625876-d8ec-4c15-ae96-655c50f562ab' width="200px"> |
128
+
129
+
130
+
131
+
132
+
133
+ -----
134
+
135
+
136
+ ## ✨ Key Features
137
+
138
+ ### 📱 Rich Multimodal Capabilities
139
+ PaddleMIX supports a wide range of the latest mainstream algorithm benchmarks and pre-trained models, covering vision-language pre-training, text-to-image, cross-modal visual tasks, and enabling diverse functionalities such as image editing, image description, and data annotation. `Gateway`: [📱 Model Library](#model-library)
140
+
141
+ ### 🧩 Simple Development Experience
142
+ PaddleMIX provides a unified model development interface, allowing developers to quickly integrate and customize models. With the Auto module, users can efficiently load pre-trained models, perform tokenization, and easily complete model training, fine-tuning (SFT), inference, and deployment through a simplified API. Additionally, the Auto module supports developers in customizing automated model integration, ensuring flexibility and scalability while enhancing development efficiency.
143
+
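+ As a rough illustration of this Auto-style `from_pretrained` workflow, the sketch below uses PaddleNLP's Auto classes, which PaddleMIX builds on; the checkpoint name is only a placeholder, and the exact model/processor classes for each multimodal model are documented in the corresponding `paddlemix/examples/*` README.
+
+ ```python
+ # Minimal sketch; the checkpoint name below is illustrative only.
+ from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer
+
+ name = "Qwen/Qwen2-0.5B"  # placeholder checkpoint
+ tokenizer = AutoTokenizer.from_pretrained(name)
+ model = AutoModelForCausalLM.from_pretrained(name, dtype="float16")
+ inputs = tokenizer("Describe the image content.", return_tensors="pd")
+ ```
+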
144
+ ### 💡 High-Performance Distributed Training and Inference Capabilities
145
+ PaddleMIX offers high-performance distributed training and inference capabilities, integrating acceleration operators like ✨Fused Linear✨ and ✨Flash Attention✨. It supports 🌀BF16 mixed-precision training and 4D mixed-parallel strategies. By optimizing inference performance through convolution layout, GroupNorm fusion, and rotating positional encoding optimization, it significantly enhances large-scale pre-training and efficient inference performance.
146
+
147
+ <img src="https://github.com/user-attachments/assets/9ab9540a-fa89-41cb-838d-95df86e33382" width = 100% />
148
+
149
+ ### 🔧 Unique Features and Tools
150
+ The multimodal data processing toolbox, DataCopilot, accelerates model iteration and upgrades. It allows developers to perform basic data operations with low code based on specific tasks. `Gateway`: [🏆 Featured Models | Tools](#featured-models-tools)
151
+
152
+
153
+ ## 🔍 Installation
154
+ ### 1. Clone PaddleMIX Repository
155
+ ```
156
+ git clone https://github.com/PaddlePaddle/PaddleMIX
157
+ cd PaddleMIX
158
+ ```
159
+
160
+ ### 2. Create Virtual Environment
161
+ ```
162
+ conda create -n paddlemix python=3.10 -y
163
+ conda activate paddlemix
164
+ ```
165
+
166
+ ### 3. ‼️ Install PaddlePaddle
167
+
168
+ #### Method 1: One-click Installation (Recommended for GPU/CPU)
169
+
170
+ - CUDA 11.x or 12.3
171
+ - PaddlePaddle 3.0.0b1
172
+ ```
173
+ sh build_paddle_env.sh
174
+ ```
175
+
176
+ #### Method 2: Manual Installation
177
+ For detailed instructions on installing PaddlePaddle, please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html).
178
+
179
+ ### 4. ‼️ Install Dependencies
180
+
181
+ #### Method 1: One-Click Installation (Recommended)
182
+ ```
183
+ sh build_env.sh
184
+ ```
185
+ #### Method 2: Manual Installation (please also refer to build_env.sh)
186
+ ```bash
187
+ # Install PaddleMIX
188
+ pip install -e .
189
+ # Install ppdiffusers
190
+ cd ppdiffusers
191
+ pip install -e .
192
+ cd ..
+ ```
193
+
194
+ ### 5. ‼️ Verify Installation
195
+
196
+ Run the following command to verify your installation:
197
+ ```bash
198
+ sh check_env.sh
199
+ ```
200
+
201
+ Recommended versions for environment and dependencies:
202
+ - paddlepaddle: 3.0.0b2 or develop version
203
+ - paddlenlp: 3.0.0b2
204
+ - ppdiffusers: 0.29.0
205
+ - huggingface_hub: 0.23.0
206
+
207
+ ### 6. Install Custom Operators (Optional)
208
+ * Some models require custom operators (FastLayerNorm, FusedLayerNorm), such as EVA-CLIP, DIT_LLAMA, etc.
209
+ * Skip this step for non-CUDA environments (e.g., Ascend NPU)
210
+ * ```bash
211
+ cd paddlemix/external_ops
212
+ python setup.py install
213
+ ```
214
+
215
+
216
+
217
+
219
+ ## 🔥 Tutorials
220
+
221
+ **Quick Start**
222
+ - [Multimodal Understanding: Beginner's Guide [Example: InternVL2 Model]](paddlemix/examples/internvl2/README.md)
223
+ - [Multimodal Generation: Zero to Hero Guide [Example: Stable Diffusion Model]](ppdiffusers/examples/stable_diffusion/README.md)
224
+ - [Cross-modal Task Pipeline: Getting Started](applications/README.md/#getting-started)
225
+
226
+ **Hands-On Practice & Examples**
227
+ - [LLaVA Model: Full Process Practice from Training to Inference](https://aistudio.baidu.com/projectdetail/7917712)
228
+ - [SDXL Application: Create Your Own Olympic Poster Generator](https://aistudio.baidu.com/projectdetail/8251202)
229
+ - [PaddleMIX Multimodal AI Applications: Project Classification Overview](./paddlemix_applications.md)
230
+
231
+ **Multi-Hardware Usage**
232
+ - For the model list and usage supported by Ascend 910B, please refer to [Ascend Hardware Usage](./docs/hardware_support/ascend_usage.md)
233
+
234
+ **Data Preparation & Fine-Tuning**
235
+ - [Model Training and Fine-Tuning Techniques](paddlemix/tools/README.md)
236
+
237
+ **Inference Deployment**
238
+ - [Deployment Guide: From Development to Production Environment](deploy/README.md)
239
+
240
+
241
+
242
+ ## 📱 Model Library
243
+ <table align="center">
244
+ <tbody>
245
+ <tr align="center" valign="center">
246
+ <td>
247
+ <b>Multimodal Understanding</b>
248
+ </td>
249
+ <td>
250
+ <b>Multimodal Generation</b>
251
+ </td>
252
+ <td>
253
+ <b>Unified Multimodal Foundation Model</b>
254
+ </td>
255
+ </tr>
256
+ <tr valign="top">
257
+ <td>
258
+ <ul>
259
+ </ul>
260
+ <li><b>Image-Text Pre-training</b></li>
261
+ <ul>
262
+ <li><a href="paddlemix/examples/clip">CLIP</a></li>
263
+ <li><a href="paddlemix/examples/evaclip">EVA-CLIP</a></li>
264
+ <li><a href="paddlemix/examples/llava">LLaVA-1.5</a></li>
265
+ <li><a href="paddlemix/examples/llava">LLaVA-1.6</a></li>
266
+ <li><a href="paddlemix/examples/llava">LLaVA-NeXT</a></li>
267
+ <li><a href="paddlemix/examples/llava_onevision">LLaVA-onevision</a></li>
268
+ <li><a href="paddlemix/examples/llava_onevision">Aquila-VL-2B-llava-qwen</a></li>
269
+ <li><a href="paddlemix/examples/llava_critic">LLaVA-Critic</a></li>
270
+ <li><a href="paddlemix/examples/llava_denseconnector">LLaVA-DenseConnector</a></li>
271
+ <li><a href="paddlemix/examples/qwen_vl">Qwen-VL</a></li>
272
+ <li><a href="paddlemix/examples/qwen2_vl">Qwen2-VL</a></li>
273
+ <li><a href="paddlemix/examples/internvl2">InternVL2</a></li>
274
+ <li><a href="paddlemix/examples/minimonkey">Mini-Monkey</a></li>
275
+ <li><a href="paddlemix/examples/coca">CoCa</a></li>
276
+ <li><a href="paddlemix/examples/blip2">BLIP-2</a></li>
277
+ <li><a href="paddlemix/examples/minigpt4">miniGPT-4</a></li>
278
+ <li><a href="paddlemix/examples/visualglm">VIsualGLM</a></li>
279
+ <li><a href="paddlemix/examples/cogvlm">CogVLM && CogAgent</a></li>
280
+ <li><a href="paddlemix/examples/internlm_xcomposer2">InternLM-XComposer2</a></li>
281
+ </ul>
282
+ </ul>
283
+ <li><b>Open-World Visual Model</b></li>
284
+ <ul>
285
+ <li><a href="paddlemix/examples/groundingdino">Grounding DINO</a></li>
286
+ <li><a href="paddlemix/examples/sam">SAM</a></li>
287
+ <li><a href="paddlemix/examples/YOLO-World">YOLO-World</a></li>
288
+ </ul>
289
+ </ul>
290
+ <li><b>More Multimodal Pre-trained Models</b></li>
291
+ <ul>
292
+ <li><a href="paddlemix/examples/imagebind">ImageBind</a></li>
293
+ </ul>
294
+ </ul>
295
+ <li><b>Data Analysis</b></li>
296
+ <ul>
297
+ <li><a href="./paddlemix/datacopilot/example/pp_inscaptagger/">PP-InsCapTagger</a></li>
298
+ </ul>
299
+ </td>
300
+ <td>
301
+ <ul>
302
+ </ul>
303
+ <li><b>Text-to-Image</b></li>
304
+ <ul>
305
+ <li><a href="ppdiffusers/examples/stable_diffusion">Stable Diffusion</a></li>
306
+ <li><a href="ppdiffusers/examples/dreambooth/README_sd3.md">Stable Diffusion 3 (SD3)</a></li>
307
+ <li><a href="ppdiffusers/examples/controlnet">ControlNet</a></li>
308
+ <li><a href="ppdiffusers/examples/t2i-adapter">T2I-Adapter</a></li>
309
+ <li><a href="ppdiffusers/examples/text_to_image_laion400m">LDM</a></li>
310
+ <li><a href="ppdiffusers/ppdiffusers/pipelines/unidiffuser">Unidiffuser</a></li>
311
+ <li><a href="ppdiffusers/examples/class_conditional_image_generation/DiT">DiT</a></li>
312
+ <li><a href="ppdiffusers/examples/HunyuanDiT">HunyuanDiT</a></li>
313
+ </ul>
314
+ </ul>
315
+ <li><b>Text-to-Video</b></li>
316
+ <ul>
317
+ <li><a href="ppdiffusers/examples/lvdm">LVDM</a></li>
318
+ <li><a href="ppdiffusers/examples/stable_video_diffusion">SVD</a></li>
319
+ <li><a href="ppdiffusers/examples/AnimateAnyone">AnimateAnyone</a></li>
320
+ <li><a href="ppdiffusers/examples/Open-Sora">OpenSora</a></li>
321
+ </ul>
322
+ </ul>
323
+ <li><b>Audio Generation</b></li>
324
+ <ul>
325
+ <li><a href="ppdiffusers/ppdiffusers/pipelines/audioldm">AudioLDM</a></li>
326
+ <li><a href="ppdiffusers/ppdiffusers/pipelines/audioldm2">AudioLDM2</a></li>
327
+ </ul>
328
+ </td>
329
+ <td>
330
+ <ul>
331
+ </ul>
332
+ <li><b>Unified Multimodal Model</b></li>
333
+ <ul>
334
+ <li><a href="paddlemix/examples/janus">Janus</a></li>
335
+ </ul>
336
+ </td>
337
+ </tr>
338
+ </tbody>
339
+ </table>
340
+
341
+ For more model capabilities, please refer to the [Model Capability Matrix](./paddlemix/examples/README.md)
342
+
343
+ ## 🏆 Featured Models | Tools
344
+
345
+ ### 💎 Cross-Modal Task Pipeline AppFlow
346
+ <details>
347
+ <summary><b> Introduction (Click to Expand)</b></summary>
348
+
349
+ AppFlow, as the cross-modal application task pipeline of PaddleMIX, possesses powerful functionality and ease of use. By integrating cutting-edge algorithms such as LLaVA and Stable Diffusion, AppFlow has comprehensively covered various modalities including images, text, audio, and video. Through a flexible pipeline approach, it has constructed over ten multimodal applications, encompassing text-image generation, text-video generation, text-audio generation, image understanding, and more, providing users with rich demo examples. The highlight of AppFlow is its one-click prediction feature, allowing users to complete model inference with simple commands, eliminating cumbersome training and extensive coding, significantly lowering the barrier to use. Additionally, AppFlow fully leverages the dynamic-static unification advantages of the PaddlePaddle framework; users only need to set simple parameters to automatically complete model dynamic-to-static export and high-performance inference, enhancing work efficiency and optimizing model performance for one-stop application deployment.
350
+
351
+ `Gateway`: [Application Documentation Example](applications/README.md/#quick-start).
352
+
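+ As a rough sketch of this one-click pipeline style (the `app` name and model identifiers below are illustrative assumptions; the Application Documentation Example linked above lists the exact ones):
+
+ ```python
+ # Sketch only: an open-set detection + segmentation pipeline through Appflow.
+ from paddlemix.appflow import Appflow
+ from ppdiffusers.utils import load_image
+
+ image = load_image("example_image.jpg")  # placeholder input image
+ task = Appflow(app="openset_det_sam",
+                models=["GroundingDino/groundingdino-swint-ogc", "Sam/SamVitH-1024"])
+ result = task(image=image, prompt="dog")
+ ```
+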
353
+ </details>
354
+
355
+ ### 💎 Multimodal Data Processing Toolbox DataCopilot
356
+ <details>
357
+ <summary><b> Introduction (Click to Expand)</b></summary>
358
+
359
+ In real-world application scenarios, there is a substantial demand for fine-tuning multimodal large models using proprietary data to enhance model performance, making data elements the core of this process. Based on this, PaddleMIX provides the DataCopilot tool for data processing and analysis, allowing developers to achieve an end-to-end development experience within the PaddleMIX suite.
360
+
361
+ PP-InsCapTagger (Instance Capability Tagger) is a dataset capability tagging model implemented by DataCopilot based on PaddleMIX. It is used to label the capabilities of multimodal data instances. By optimizing the dataset through instance capability distribution, it can improve model training efficiency and provide an efficient solution for dataset analysis and evaluation. Combining the model inference labeling results with the LLaVA SFT dataset optimization can **improve LLaVA model training efficiency by 50% during the SFT phase.**
362
+
363
+ `Gateway`: [Application Documentation Example](paddlemix/datacopilot/readme.md).
364
+
365
+ </details>
366
+
367
+ <details>
368
+ <summary><b> PP-InsCapTagger (Click to Expand)</b></summary>
369
+
370
+ | Model | ScienceQA | TextVQA | VQAv2 | GQA | MMMU | MME |
371
+ |----------------------------------|-----------------------------------------|----------------------------------------|----------------------------------------|----------------------------------------|----------------------------------------|-----------------------------------------|
372
+ | llava-1.5-7b (origin) | 66.8 | 58.2 | 78.5 | 62 | - | - |
373
+ | llava-1.5-7b (rerun) | 69.01 | 57.6 | 79 | 62.95 | 36.89 | 1521<br>323 |
374
+ | llava-1.5-7b (random 50%) | 67.31 | 55.6 | 76.89 | 61.01 | 34.67 | 1421<br>286 |
375
+ | **llava-1.5-7b (our 50%)** | **70.24** *(+2.93)* | **57.12** *(+1.52)* | **78.32** *(+1.43)* | **62.14** *(+1.13)* | **37.11** *(+2.44)* | **1476** *(+55)*<br>**338** *(+52)* |
376
+ `Gateway`: [Application Documentation Example](paddlemix/datacopilot/example/pp_inscaptagger/readme.md).
377
+ </details>
378
+
379
+ ## 🤔 FAQ
380
+ For answers to some common questions about our project, please refer to the [FAQ](docs/FAQ.md). If your question is not addressed, feel free to raise it in the [Issues](https://github.com/PaddlePaddle/PaddleMIX/issues).
381
+
382
+ ## 📝 License
383
+ This project is released under the [Apache 2.0 license](LICENSE).
384
+
385
+ ## 📌 Community Communication
386
+
387
+ - Scan the QR code and fill out the questionnaire to join the communication group and engage deeply with numerous community developers and the official team.
388
+ <div align="center">
389
+ <img src="https://github.com/user-attachments/assets/ecf292da-9ac6-41cb-84b6-df726ef4522d" width="300" height="300" />
390
+ </div>
PaddleMIX/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
PaddleMIX/check_env.sh ADDED
@@ -0,0 +1,101 @@
1
+ #!/bin/bash
2
+ # Exit on error
3
+ set -e
4
+
5
+ # Find an available Python interpreter
6
+ find_python() {
7
+ for cmd in python3 python python3.8 python3.9 python3.10; do
8
+ if command -v "$cmd" > /dev/null 2>&1; then
9
+ if $cmd -c "import sys; exit(0 if sys.version_info >= (3,7) else 1)" 2>/dev/null; then
10
+ echo "$cmd"
11
+ return 0
12
+ fi
13
+ fi
14
+ done
15
+ return 1
16
+ }
17
+
18
+ # Locate the Python interpreter
19
+ PYTHON_CMD=$(find_python)
20
+
21
+ if [ -z "$PYTHON_CMD" ]; then
22
+ echo "错误: 未找到合适的Python环境 (需要Python >= 3.7)"
23
+ exit 1
24
+ fi
25
+
26
+ echo "使用Python环境: $($PYTHON_CMD --version)"
27
+ echo "=====================Package Versions====================="
28
+
29
+ # Check the paddlepaddle version
30
+ echo "检查paddlepaddle版本..."
31
+ if $PYTHON_CMD -c "import paddle" 2>/dev/null; then
32
+ paddle_version=$($PYTHON_CMD -c "import paddle; print(paddle.__version__)")
33
+ echo "当前paddlepaddle版本: $paddle_version"
34
+
35
+ # Check whether this is the GPU build
36
+ if $PYTHON_CMD -c "import paddle; print(paddle.device.is_compiled_with_cuda())" 2>/dev/null | grep -q "True"; then
37
+ echo "paddlepaddle类型: GPU版本"
38
+ cuda_version=$($PYTHON_CMD -c "import paddle; print(paddle.version.cuda())")
39
+ echo "CUDA版本: $cuda_version"
40
+ else
41
+ echo "⚠️ paddlepaddle类型: CPU版本,推荐使用GPU版本"
42
+ fi
43
+
44
+ if [[ "$paddle_version" == "3.0.0b2" || "$paddle_version" == *"0.0.0"* ]]; then
45
+ echo "✅ paddlepaddle版本符合要求"
46
+ else
47
+ echo "⚠️ 建议使用paddlepaddle 3.0.0b2或develop版本"
48
+ fi
49
+ else
50
+ echo "❌ 未安装paddlepaddle"
51
+ fi
52
+
53
+ # Check the paddlenlp version
54
+ echo -e "\n检查paddlenlp版本..."
55
+ if $PYTHON_CMD -c "import paddlenlp" 2>/dev/null; then
56
+ paddlenlp_version=$($PYTHON_CMD -c "import paddlenlp; print(paddlenlp.__version__)")
57
+ echo "当前paddlenlp版本: $paddlenlp_version"
58
+ if [[ "$paddlenlp_version" == "3.0.0b2" ]]; then
59
+ echo "✅ paddlenlp版本符合要求"
60
+ else
61
+ echo "⚠️ 建议使用paddlenlp 3.0.0b2版本"
62
+ fi
63
+ else
64
+ echo "❌ 未安装paddlenlp"
65
+ fi
66
+
67
+ # Check the ppdiffusers version
68
+ echo -e "\n检查ppdiffusers版本..."
69
+ if $PYTHON_CMD -c "import ppdiffusers" 2>/dev/null; then
70
+ ppdiffusers_version=$($PYTHON_CMD -c "import ppdiffusers; print(ppdiffusers.__version__)")
71
+ echo "当前ppdiffusers版本: $ppdiffusers_version"
72
+ if [[ "$ppdiffusers_version" == "0.29.0" ]]; then
73
+ echo "✅ ppdiffusers版本符合要求"
74
+ else
75
+ echo "⚠️ 建议使用ppdiffusers 0.29.0版本"
76
+ fi
77
+ else
78
+ echo "❌ 未安装ppdiffusers"
79
+ fi
80
+
81
+ # Check the huggingface_hub version
82
+ echo -e "\n检查huggingface_hub版本..."
83
+ if $PYTHON_CMD -c "import huggingface_hub" 2>/dev/null; then
84
+ hf_version=$($PYTHON_CMD -c "import huggingface_hub; print(huggingface_hub.__version__)")
85
+ echo "当前huggingface_hub版本: $hf_version"
86
+ if [[ "$hf_version" == "0.23.0" ]]; then
87
+ echo "✅ huggingface_hub版本符合要求"
88
+ else
89
+ echo "⚠️ 建议使用huggingface_hub 0.23.0版本"
90
+ fi
91
+ else
92
+ echo "❌ 未安装huggingface_hub"
93
+ fi
94
+
95
+ echo -e "\n===================Version Summary===================="
96
+ echo "推荐版本:"
97
+ echo "- paddlepaddle: 3.0.0b2或develop版本"
98
+ echo "- paddlenlp: 3.0.0b2"
99
+ echo "- ppdiffusers: 0.29.0"
100
+ echo "- huggingface_hub: 0.23.0"
101
+ echo "===================================================="
PaddleMIX/pyproject.toml ADDED
@@ -0,0 +1,23 @@
1
+ [tool.isort]
2
+ profile = 'black'
3
+ known_third_party = ["paddle"]
4
+
5
+ [tool.black]
6
+ line-length = 119
7
+ target_version = ['py35', 'py36', 'py37', 'py38', 'py39', 'py310']
8
+ exclude = ['.flake8']
9
+
10
+ [tool.pytest.ini_options]
11
+ minversion = "6.0"
12
+ pythonpath = ["."]
13
+ testpaths = [
14
+ # "tests/models",
15
+ ]
16
+ python_files = [
17
+ "test.py",
18
+ "test_*.py"
19
+ ]
20
+ filterwarnings = [
21
+ "ignore::UserWarning",
22
+ 'ignore::DeprecationWarning',
23
+ ]
PaddleMIX/requirements.txt ADDED
@@ -0,0 +1,15 @@
1
+ numpy
2
+ paddlenlp>=3.0.0b2
3
+ tensorboardX
4
+ opencv-python
5
+ Pillow
6
+ pycocoevalcap
7
+ ftfy
8
+ regex
9
+ einops>=0.6.1
10
+ soundfile
11
+ librosa
12
+ h5py
13
+ jsonschema>=4.19.0
14
+ referencing>=0.32.1
15
+ decord>=0.6.0
VILA/LongVILA.md ADDED
@@ -0,0 +1,79 @@
1
+ <p align="center">
2
+ <img src="demo_images/longvila-logo.png" width="60%"/>
3
+ </p>
4
+
5
+ # LongVILA: Scaling Long-Context Visual Language Models for Long Videos
6
+
7
+ [![Code License](https://img.shields.io/badge/Code%20License-Apache_2.0-green.svg)](CODE_LICENSE)
8
+ [![Model License](https://img.shields.io/badge/MODEL%20License-CC%20By%20NC%204.0-red.svg)](MODEL_LICENSE)
9
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)
10
+
11
+
12
+ [![Paper](https://img.shields.io/badge/Paper-Arvix%20Link-green)](https://arxiv.org/abs/2408.10188)
13
+ [![Huggingface Models](https://img.shields.io/badge/Models-Huggingface%20Models-bron)](https://huggingface.co/collections/Efficient-Large-Model/longvila-66c3fce79284c8209f119b32)
14
+
15
+ ## 💡 Introduction
16
+
17
+ Long-context capability is critical for multi-modal foundation models. We introduce LongVILA, a full-stack solution for long-context vision-language models, including system, model training, and dataset development. On the system side, we introduce the first long-context Multi-Modal Sequence Parallelism (MM-SP) system that enables long-context training and inference, supporting 2M context length training on 256 GPUs. MM-SP is also efficient, being 2.1x - 5.7x faster than Ring-Style Sequence Parallelism and 1.1x - 1.4x faster than Megatron-LM in text-only settings. Moreover, it seamlessly integrates with Hugging Face Transformers. For model training, we propose a five-stage pipeline comprising alignment, pre-training, short supervised fine-tuning, context extension, and long supervised fine-tuning. Regarding datasets, we meticulously construct large-scale visual language pre-training datasets and long video instruction-following datasets to support our multi-stage training process. The full-stack solution extends the feasible frame number of VILA by a factor of 128 (from 8 to 1024 frames) and improves the long video captioning score from 2.00 to 3.26 (1.6x), achieving 99.5% accuracy on a 1400-frame video (274k context length) needle-in-a-haystack test. LongVILA-8B also demonstrates consistent accuracy improvements on long videos in the VideoMME benchmark as the number of video frames increases.
18
+
19
+ <p align="center">
20
+ <img src="demo_images/LongVILA-pipeline.png" width="100%"/>
21
+ </p>
22
+
23
+ ## Installation
24
+
25
+ ```bash
26
+ ./environment_setup.sh vila
27
+ ```
28
+
29
+ ## Evaluations
30
+ Please refer to `scripts/v1_5/eval/needle.sh`, `scripts/v1_5/eval/video_chatgpt/run_vila_benchmark.sh`, and `llava/eval/video_mme/eval.sh` for needle-in-a-haystack, LongVILA-Caption, and Video MME evaluations.
31
+
32
+
33
+ > [!Note]
34
+ > 💡**Sequence Parallelism Configuration**
35
+ >
36
+ > To enable sequence parallelism, you can set the following parameters in the training script:
37
+ >
38
+ > `seq_parallel_size`: The degree of sequence parallelism (SP). SP is disabled by default (value: -1).
39
+ >
40
+ > `seq_parallel_ring_size`: The size of the communication process group used by the optimized Ring Attention approach within SP. Ring Attention is disabled by default.
41
+ >
42
+ > `seq_parallel_ring_type`: The Ring Attention implementation. Supports ['ring_varlen', 'zigzag_ring_varlen'] in 2D attention; only takes effect when *seq_parallel_ring_size* > 1.
43
+ >
44
+ > Please note that when SP is enabled, we treat each group of seq_parallel_size GPUs as a single device, with the global batch size calculated as the product of the per-device batch size and the data parallelism size.
45
+
46
+
47
+
48
+ ## 🔒 License
49
+
50
+ - The code is released under the Apache 2.0 license as found in the [LICENSE](./LICENSE) file.
51
+ - The pretrained weights are released under the [CC-BY-NC-SA-4.0 license](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en).
52
+ - The service is a research preview intended for non-commercial use only, and is subject to the following licenses and terms:
53
+ - [Model License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA. For LLAMA3-VILA checkpoints terms of use, please refer to the [LLAMA3 License](https://llama.meta.com/llama3/license/) for additional details.
54
+ - [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI
55
+ - [Dataset Licenses](./data_prepare/LICENSE) for each one used during training.
56
+
57
+ ## Citations
58
+
59
+ ```
60
+ @article{longvila,
61
+ title={LongVILA: Scaling Long-Context Visual Language Models for Long Videos},
62
+ author={Fuzhao Xue and Yukang Chen and Dacheng Li and Qinghao Hu and Ligeng Zhu and Xiuyu Li and Yunhao Fang and Haotian Tang and Shang Yang and Zhijian Liu and Yihui He and Hongxu Yin and Pavlo Molchanov and Jan Kautz and Linxi Fan and Yuke Zhu and Yao Lu and Song Han},
63
+ year={2024},
64
+ eprint={2408.10188},
65
+ archivePrefix={arXiv},
66
+ primaryClass={cs.CV}
67
+ }
68
+ ```
69
+
70
+ # Acknowledgement
71
+
72
+ - [LLaVA](https://github.com/haotian-liu/LLaVA): the codebase we built upon. Thanks for their wonderful work.
73
+ - [LongVA](https://github.com/EvolvingLMMs-Lab/LongVA): we borrowed the long video needle in the haystack evaluation script from this repository.
74
+ - [LongLoRA](https://github.com/dvlab-research/LongLoRA): we modified the low-rank long-context fine-tuning code from this repository.
75
+ - [USP (YunChang)](https://github.com/feifeibear/long-context-attention): we adopted the 2D attention implementation from this repository.
76
+ - [RingFlashAttention](https://github.com/zhuzilin/ring-flash-attention): we adopted the ring flash attention implementation from this repository.
77
+ - [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed): we adopted the all-to-all implementation from this repository.
78
+ - [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT): we borrowed video evaluation script from this repository.
79
+ - [MMC4](https://github.com/allenai/mmc4), [COYO-700M](https://github.com/kakaobrain/coyo-dataset), [M3IT](https://huggingface.co/datasets/MMInstruction/M3IT), [OpenORCA/FLAN](https://huggingface.co/datasets/Open-Orca/FLAN), [ShareGPT4V](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V), [WIT](google-research-datasets/wit), [GSM8K-ScRel](https://github.com/OFA-Sys/gsm8k-ScRel/blob/main/data/train_use.jsonl), [VisualGenome](https://visualgenome.org/api/v0/api_home.html), [VCR](https://visualcommonsense.com/download/), [ScienceQA](https://huggingface.co/datasets/derek-thomas/ScienceQA), [Shot2Story](https://github.com/bytedance/Shot2Story/blob/master/DATA.md), [Youcook2](http://youcook2.eecs.umich.edu/), [Vatex](https://eric-xw.github.io/vatex-website/download.html), [ShareGPT-Video](https://huggingface.co/datasets/ShareGPTVideo/train_video_and_instruction), [ShareGPT4o](https://sharegpt4o.github.io/) for providing datasets used in this research.
VILA/convert_ckpt.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import json
18
+ import os.path as osp
19
+ from collections import OrderedDict
20
+ from glob import glob
21
+
22
+ from safetensors import safe_open
23
+ from transformers import (
24
+ AutoConfig,
25
+ AutoModel,
26
+ AutoModelForCausalLM,
27
+ AutoTokenizer,
28
+ BitsAndBytesConfig,
29
+ LlamaForCausalLM,
30
+ PretrainedConfig,
31
+ PreTrainedModel,
32
+ )
33
+
34
+ import llava.model.language_model.llava_llama
35
+ from llava.model import *
36
+ from llava.model.configuration_llava import LlavaConfig
37
+ from llava.model.language_model.builder import build_llm_and_tokenizer
38
+ from llava.model.multimodal_encoder.builder import SiglipVisionTower, build_vision_tower
39
+ from llava.model.multimodal_encoder.siglip import SiglipImageProcessor, SiglipVisionConfig, SiglipVisionModel
40
+ from llava.model.multimodal_projector.builder import build_mm_projector
41
+ from llava.model.utils import get_model_config
42
+
43
+
44
+ def main(
45
+ path="~/workspace/VILA/checkpoints/Llama-2-7b-hf-google/siglip-large-patch16-384-align-llava_1_5_mm_align",
46
+ output_dir="checkpoints/converted_models",
47
+ ):
48
+ path = osp.expanduser(path)
49
+ # assuming 7b llama + siglip
50
+ config = AutoConfig.from_pretrained("CI-new-format-llama7b-siglip")
51
+ model = AutoModel.from_config(config)
52
+
53
+ # key mapping: remap checkpoint keys to the converted model's layout
54
+ state_dict = {}
55
+
56
+ def fn(k):
57
+ if (
58
+ k.startswith("model.layers")
59
+ or k.startswith("model.norm")
60
+ or k.startswith("model.embed_tokens")
61
+ or k.startswith("lm_head")
62
+ ):
63
+ # llm layer
64
+ new_k = "llm." + k
65
+ return new_k
66
+ if k.startswith("model.vision_tower.vision_tower.vision_model."):
67
+ new_k = k.replace(
68
+ "model.vision_tower.vision_tower.vision_model.", "vision_tower.vision_tower.vision_model."
69
+ )
70
+ return new_k
71
+ if k.startswith("model.mm_projector"):
72
+ new_k = k.replace("model.mm_projector.", "mm_projector.layers.")
73
+ return new_k
74
+ return k
75
+
76
+ for sf in glob(osp.join(path, "*.safetensors")):
77
+ with safe_open(sf, framework="pt") as f:
78
+ for key in f.keys():
79
+ state_dict[fn(key)] = f.get_tensor(key)
80
+
81
+ for k in state_dict.keys():
82
+ assert k in model.state_dict().keys()
83
+
84
+ model.load_state_dict(state_dict)
85
+ model.save_pretrained(output_dir)
86
+
87
+
88
+ if __name__ == "__main__":
89
+ import fire
90
+
91
+ fire.Fire(main)
VILA/environment_setup.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ # This is required to activate conda environment
4
+ eval "$(conda shell.bash hook)"
5
+
6
+ # CONDA_ENV=${1:-""}
7
+ CONDA_ENV=vila
8
+ if [ -n "$CONDA_ENV" ]; then
9
+ conda create -n $CONDA_ENV python=3.10 -y
10
+ conda activate $CONDA_ENV
11
+ else
12
+ echo "Skipping conda environment creation. Make sure you have the correct environment activated."
13
+ fi
14
+
15
+ # This is required to enable PEP 660 support
16
+ pip install --upgrade pip
17
+
18
+ # This is optional if you prefer to use built-in nvcc
19
+ conda install -c nvidia cuda-toolkit -y
20
+
21
+ # Install FlashAttention2
22
+ pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
23
+
24
+ # Install VILA
25
+ pip install -e .
26
+ pip install -e ".[train]"
27
+ pip install -e ".[eval]"
28
+
29
+ # Install HF's Transformers
30
+ pip install git+https://github.com/huggingface/transformers@v4.37.2
31
+ site_pkg_path=$(python -c 'import site; print(site.getsitepackages()[0])')
32
+ cp -rv ./llava/train/transformers_replace/* $site_pkg_path/transformers/
33
+ cp -rv ./llava/train/deepspeed_replace/* $site_pkg_path/deepspeed/
VILA/predict.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # This file is originated from: https://github.com/haotian-liu/LLaVA/
17
+
18
+ import os
19
+ import subprocess
20
+ import time
21
+ from io import BytesIO
22
+ from threading import Thread
23
+
24
+ import requests
25
+ import torch
26
+ from cog import BasePredictor, ConcatenateIterator, Input, Path
27
+ from PIL import Image
28
+ from transformers.generation.streamers import TextIteratorStreamer
29
+
30
+ from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
31
+ from llava.conversation import SeparatorStyle, conv_templates
32
+ from llava.mm_utils import KeywordsStoppingCriteria, tokenizer_image_token
33
+ from llava.model.builder import load_pretrained_model
34
+ from llava.utils import disable_torch_init
35
+
36
+ os.environ["HUGGINGFACE_HUB_CACHE"] = os.getcwd() + "/weights"
37
+
38
+ # url for the weights mirror
39
+ REPLICATE_WEIGHTS_URL = "https://weights.replicate.delivery/default"
40
+ # files to download from the weights mirrors
41
+ weights = [
42
+ {
43
+ "dest": "liuhaotian/llava-v1.5-13b",
44
+ # git commit hash from huggingface
45
+ "src": "llava-v1.5-13b/006818fc465ebda4c003c0998674d9141d8d95f8",
46
+ "files": [
47
+ "config.json",
48
+ "generation_config.json",
49
+ "pytorch_model-00001-of-00003.bin",
50
+ "pytorch_model-00002-of-00003.bin",
51
+ "pytorch_model-00003-of-00003.bin",
52
+ "pytorch_model.bin.index.json",
53
+ "special_tokens_map.json",
54
+ "tokenizer.model",
55
+ "tokenizer_config.json",
56
+ ],
57
+ },
58
+ {
59
+ "dest": "openai/clip-vit-large-patch14-336",
60
+ "src": "clip-vit-large-patch14-336/ce19dc912ca5cd21c8a653c79e251e808ccabcd1",
61
+ "files": ["config.json", "preprocessor_config.json", "pytorch_model.bin"],
62
+ },
63
+ ]
64
+
65
+
66
+ def download_json(url: str, dest: Path):
67
+ res = requests.get(url, allow_redirects=True)
68
+ if res.status_code == 200 and res.content:
69
+ with dest.open("wb") as f:
70
+ f.write(res.content)
71
+ else:
72
+ print(f"Failed to download {url}. Status code: {res.status_code}")
73
+
74
+
75
+ def download_weights(baseurl: str, basedest: str, files: list[str]):
76
+ basedest = Path(basedest)
77
+ start = time.time()
78
+ print("downloading to: ", basedest)
79
+ basedest.mkdir(parents=True, exist_ok=True)
80
+ for f in files:
81
+ dest = basedest / f
82
+ url = os.path.join(REPLICATE_WEIGHTS_URL, baseurl, f)
83
+ if not dest.exists():
84
+ print("downloading url: ", url)
85
+ if dest.suffix == ".json":
86
+ download_json(url, dest)
87
+ else:
88
+ subprocess.check_call(["pget", url, str(dest)], close_fds=False)
89
+ print("downloading took: ", time.time() - start)
90
+
91
+
92
+ class Predictor(BasePredictor):
93
+ def setup(self) -> None:
94
+ """Load the model into memory to make running multiple predictions efficient"""
95
+ for weight in weights:
96
+ download_weights(weight["src"], weight["dest"], weight["files"])
97
+ disable_torch_init()
98
+
99
+ self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
100
+ "liuhaotian/llava-v1.5-13b", model_name="llava-v1.5-13b", model_base=None, load_8bit=False, load_4bit=False
101
+ )
102
+
103
+ def predict(
104
+ self,
105
+ image: Path = Input(description="Input image"),
106
+ prompt: str = Input(description="Prompt to use for text generation"),
107
+ top_p: float = Input(
108
+ description="When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens",
109
+ ge=0.0,
110
+ le=1.0,
111
+ default=1.0,
112
+ ),
113
+ temperature: float = Input(
114
+ description="Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic",
115
+ default=0.2,
116
+ ge=0.0,
117
+ ),
118
+ max_tokens: int = Input(
119
+ description="Maximum number of tokens to generate. A word is generally 2-3 tokens", default=1024, ge=0
120
+ ),
121
+ ) -> ConcatenateIterator[str]:
122
+ """Run a single prediction on the model"""
123
+
124
+ conv_mode = "llava_v1"
125
+ conv = conv_templates[conv_mode].copy()
126
+
127
+ image_data = load_image(str(image))
128
+ image_tensor = self.image_processor.preprocess(image_data, return_tensors="pt")["pixel_values"].half().cuda()
129
+
130
+ # loop start
131
+
132
+ # just one turn, always prepend image token
133
+ inp = DEFAULT_IMAGE_TOKEN + "\n" + prompt
134
+ conv.append_message(conv.roles[0], inp)
135
+
136
+ conv.append_message(conv.roles[1], None)
137
+ prompt = conv.get_prompt()
138
+
139
+ input_ids = (
140
+ tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
141
+ )
142
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
143
+ keywords = [stop_str]
144
+ stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
145
+ streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, timeout=20.0)
146
+
147
+ with torch.inference_mode():
148
+ thread = Thread(
149
+ target=self.model.generate,
150
+ kwargs=dict(
151
+ inputs=input_ids,
152
+ images=image_tensor,
153
+ do_sample=True,
154
+ temperature=temperature,
155
+ top_p=top_p,
156
+ max_new_tokens=max_tokens,
157
+ streamer=streamer,
158
+ use_cache=True,
159
+ stopping_criteria=[stopping_criteria],
160
+ ),
161
+ )
162
+ thread.start()
163
+ # workaround: second-to-last token is always " "
164
+ # but we want to keep it if it's not the second-to-last token
165
+ prepend_space = False
166
+ for new_text in streamer:
167
+ if new_text == " ":
168
+ prepend_space = True
169
+ continue
170
+ if new_text.endswith(stop_str):
171
+ new_text = new_text[: -len(stop_str)].strip()
172
+ prepend_space = False
173
+ elif prepend_space:
174
+ new_text = " " + new_text
175
+ prepend_space = False
176
+ if len(new_text):
177
+ yield new_text
178
+ if prepend_space:
179
+ yield " "
180
+ thread.join()
181
+
182
+
183
+ def load_image(image_file):
184
+ if image_file.startswith("http") or image_file.startswith("https"):
185
+ response = requests.get(image_file)
186
+ image = Image.open(BytesIO(response.content)).convert("RGB")
187
+ else:
188
+ image = Image.open(image_file).convert("RGB")
189
+ return image
VLMEvalKit/.pre-commit-config.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exclude: |
2
+ (?x)^(
3
+ scripts/|
4
+ assets/|
5
+ vlmeval/config.py
6
+ )
7
+ repos:
8
+ - repo: https://github.com/PyCQA/flake8
9
+ rev: 6.1.0
10
+ hooks:
11
+ - id: flake8
12
+ args: ["--max-line-length=120", "--ignore=F401,F403,F405,E402,E722,E741,W503,E231,E702"]
13
+ exclude: ^configs/
14
+ - repo: https://github.com/pre-commit/mirrors-yapf
15
+ rev: v0.30.0
16
+ hooks:
17
+ - id: yapf
18
+ args: ["--style={column_limit=120}"]
19
+ - repo: https://github.com/pre-commit/pre-commit-hooks
20
+ rev: v3.1.0
21
+ hooks:
22
+ - id: trailing-whitespace
23
+ - id: check-yaml
24
+ - id: end-of-file-fixer
25
+ - id: requirements-txt-fixer
26
+ - id: check-merge-conflict
27
+ - id: fix-encoding-pragma
28
+ args: ["--remove"]
29
+ - id: mixed-line-ending
30
+ args: ["--fix=lf"]
VLMEvalKit/requirements.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ decord; platform_machine != 'arm64'
2
+ eva-decord; platform_machine == 'arm64'
3
+ gradio
4
+ huggingface_hub
5
+ imageio
6
+ matplotlib
7
+ numpy
8
+ omegaconf
9
+ openai
10
+ opencv-python>=4.4.0.46
11
+ openpyxl
12
+ pandas
13
+ pillow
14
+ portalocker
15
+ protobuf
16
+ python-dotenv
17
+ requests
18
+ rich
19
+ sentencepiece
20
+ setuptools
21
+ sty
22
+ tabulate
23
+ tiktoken
24
+ timeout-decorator
25
+ torch
26
+ tqdm
27
+ transformers
28
+ typing_extensions
29
+ validators
30
+ xlsxwriter
a_distributed_notebook/FSDP_tutorial.md ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Getting Started with Fully Sharded Data Parallel(FSDP)
2
+ ======================================================
3
+
4
+ **Author**: [Hamid Shojanazeri](https://github.com/HamidShojanazeri),
5
+ [Yanli Zhao](https://github.com/zhaojuanmao), [Shen
6
+ Li](https://mrshenli.github.io/)
7
+
8
+ ::: {.note}
9
+ ::: {.title}
10
+ Note
11
+ :::
12
+
13
+ View and edit this tutorial in
14
+ [github](https://github.com/pytorch/tutorials/blob/main/intermediate_source/FSDP_tutorial.rst).
15
+ :::
16
+
17
+ Training AI models at a large scale is a challenging task that requires
18
+ a lot of compute power and resources. It also comes with considerable
19
+ engineering complexity to handle the training of these very large
20
+ models. [PyTorch
21
+ FSDP](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/),
22
+ released in PyTorch 1.11, makes this easier.
23
+
24
+ In this tutorial, we show how to use [FSDP
25
+ APIs](https://pytorch.org/docs/stable/fsdp.html) for simple MNIST
26
+ models that can be extended to other larger models such as [HuggingFace
27
+ BERT models](https://huggingface.co/blog/zero-deepspeed-fairscale) and [GPT
28
+ 3 models up to 1T
29
+ parameters](https://pytorch.medium.com/training-a-1-trillion-parameter-model-with-pytorch-fully-sharded-data-parallel-on-aws-3ac13aa96cff).
30
+ The sample DDP MNIST code has been borrowed from
31
+ [here](https://github.com/yqhu/mnist_examples).
32
+
33
+ How FSDP works
34
+ --------------
35
+
36
+ In
37
+ [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)
38
+ (DDP) training, each process/worker owns a replica of the model and
39
+ processes a batch of data; finally, it uses all-reduce to sum up
40
+ gradients over different workers. In DDP, the model weights and optimizer
41
+ states are replicated across all workers. FSDP is a type of data
42
+ parallelism that shards model parameters, optimizer states and gradients
43
+ across DDP ranks.
44
+
45
+ When training with FSDP, the GPU memory footprint is smaller than when
46
+ training with DDP across all workers. This makes the training of some
47
+ very large models feasible by allowing larger models or batch sizes to
48
+ fit on device. This comes with the cost of increased communication
49
+ volume. The communication overhead is reduced by internal optimizations
50
+ like overlapping communication and computation.
51
+
52
+ ![FSDP
53
+ Workflow](/_static/img/distributed/fsdp_workflow.png){.align-center
54
+ width="100.0%"}
55
+
56
+ At a high level, FSDP works as follows:
57
+
58
+ *In constructor*
59
+
60
+ - Shard model parameters and each rank only keeps its own shard
61
+
62
+ *In forward path*
63
+
64
+ - Run all\_gather to collect all shards from all ranks to recover the
65
+ full parameter in this FSDP unit
66
+ - Run forward computation
67
+ - Discard parameter shards it has just collected
68
+
69
+ *In backward path*
70
+
71
+ - Run all\_gather to collect all shards from all ranks to recover the
72
+ full parameter in this FSDP unit
73
+ - Run backward computation
74
+ - Run reduce\_scatter to sync gradients
75
+ - Discard parameters.
76
+
77
+ One way to view FSDP\'s sharding is to decompose the DDP gradient
78
+ all-reduce into reduce-scatter and all-gather. Specifically, during the
79
+ backward pass, FSDP reduces and scatters gradients, ensuring that each
80
+ rank possesses a shard of the gradients. Then it updates the
81
+ corresponding shard of the parameters in the optimizer step. Finally, in
82
+ the subsequent forward pass, it performs an all-gather operation to
83
+ collect and combine the updated parameter shards.
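+
+ To make this decomposition concrete, the following is a toy, single-process sketch of the bookkeeping: plain tensor arithmetic stands in for the collectives, and it is only an illustration, not real FSDP code.
+
+ ``` {.python}
+ # Toy simulation of FSDP's sharding bookkeeping with two simulated "ranks".
+ # torch.cat stands in for all-gather; sum + chunk stands in for reduce-scatter.
+ import torch
+
+ world_size = 2
+ flat_param = torch.arange(8, dtype=torch.float32)   # pretend flattened parameter
+ shards = list(flat_param.chunk(world_size))         # rank r persistently owns shards[r]
+
+ # Forward (and again before backward): all-gather recovers the full parameter.
+ gathered = torch.cat(shards)
+ assert torch.equal(gathered, flat_param)
+
+ # Backward: each rank produces a full-length local gradient...
+ local_grads = [torch.full((8,), float(r + 1)) for r in range(world_size)]
+ # ...then reduce-scatter: sum across ranks ("reduce") and keep only the owned
+ # shard ("scatter"); DDP's all-reduce would instead keep the whole summed tensor.
+ summed = torch.stack(local_grads).sum(dim=0)
+ grad_shards = list(summed.chunk(world_size))        # rank r keeps grad_shards[r]
+
+ # The optimizer step then updates only the locally owned parameter shard.
+ print(grad_shards[0], grad_shards[1])               # tensor([3., 3., 3., 3.]) twice
+ ```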
84
+
85
+ ![FSDP
86
+ Allreduce](/_static/img/distributed/fsdp_sharding.png){.align-center
87
+ width="100.0%"}
88
+
89
+ How to use FSDP
90
+ ---------------
91
+
92
+ Here we use a toy model to run training on the MNIST dataset for
93
+ demonstration purposes. The APIs and logic can be applied to training
94
+ larger models as well.
95
+
96
+ *Setup*
97
+
98
+ 1.1 Install PyTorch along with Torchvision
99
+
100
+ See the [Get Started guide](https://pytorch.org/get-started/locally/)
101
+ for information on installation.
102
+
103
+ We add the following code snippets to a Python script "FSDP\_mnist.py".
104
+
105
+ 1.2 Import necessary packages
106
+
107
+ ::: {.note}
108
+ ::: {.title}
109
+ Note
110
+ :::
111
+
112
+ This tutorial is intended for PyTorch versions 1.12 and later. If you
113
+ are using an earlier version, replace all instances of
114
+ [size\_based\_auto\_wrap\_policy]{.title-ref} with
115
+ [default\_auto\_wrap\_policy]{.title-ref} and
116
+ [fsdp\_auto\_wrap\_policy]{.title-ref} with
117
+ [auto\_wrap\_policy]{.title-ref}.
118
+ :::
119
+
120
+ ``` {.python}
121
+ # Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py
122
+ import os
123
+ import argparse
124
+ import functools
125
+ import torch
126
+ import torch.nn as nn
127
+ import torch.nn.functional as F
128
+ import torch.optim as optim
129
+ from torchvision import datasets, transforms
130
+
131
+
132
+ from torch.optim.lr_scheduler import StepLR
133
+
134
+ import torch.distributed as dist
135
+ import torch.multiprocessing as mp
136
+ from torch.nn.parallel import DistributedDataParallel as DDP
137
+ from torch.utils.data.distributed import DistributedSampler
138
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
139
+ from torch.distributed.fsdp.fully_sharded_data_parallel import (
140
+ CPUOffload,
141
+ BackwardPrefetch,
142
+ )
143
+ from torch.distributed.fsdp.wrap import (
144
+ size_based_auto_wrap_policy,
145
+ enable_wrap,
146
+ wrap,
147
+ )
148
+ ```
149
+
150
+ 1.3 Distributed training setup. As we mentioned, FSDP is a type of data
151
+ parallelism which requires a distributed training environment, so here
152
+ we use two helper functions to initialize the processes for distributed
153
+ training and clean up.
154
+
155
+ ``` {.python}
156
+ def setup(rank, world_size):
157
+ os.environ['MASTER_ADDR'] = 'localhost'
158
+ os.environ['MASTER_PORT'] = '12355'
159
+
160
+ # initialize the process group
161
+ dist.init_process_group("nccl", rank=rank, world_size=world_size)
162
+
163
+ def cleanup():
164
+ dist.destroy_process_group()
165
+ ```
166
+
167
+ 2.1 Define our toy model for handwritten digit classification.
168
+
169
+ ``` {.python}
170
+ class Net(nn.Module):
171
+ def __init__(self):
172
+ super(Net, self).__init__()
173
+ self.conv1 = nn.Conv2d(1, 32, 3, 1)
174
+ self.conv2 = nn.Conv2d(32, 64, 3, 1)
175
+ self.dropout1 = nn.Dropout(0.25)
176
+ self.dropout2 = nn.Dropout(0.5)
177
+ self.fc1 = nn.Linear(9216, 128)
178
+ self.fc2 = nn.Linear(128, 10)
179
+
180
+ def forward(self, x):
181
+
182
+ x = self.conv1(x)
183
+ x = F.relu(x)
184
+ x = self.conv2(x)
185
+ x = F.relu(x)
186
+ x = F.max_pool2d(x, 2)
187
+ x = self.dropout1(x)
188
+ x = torch.flatten(x, 1)
189
+ x = self.fc1(x)
190
+ x = F.relu(x)
191
+ x = self.dropout2(x)
192
+ x = self.fc2(x)
193
+ output = F.log_softmax(x, dim=1)
194
+ return output
195
+ ```
196
+
197
+ 2.2 Define a train function
198
+
199
+ ``` {.python}
200
+ def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None):
201
+ model.train()
202
+ ddp_loss = torch.zeros(2).to(rank)
203
+ if sampler:
204
+ sampler.set_epoch(epoch)
205
+ for batch_idx, (data, target) in enumerate(train_loader):
206
+ data, target = data.to(rank), target.to(rank)
207
+ optimizer.zero_grad()
208
+ output = model(data)
209
+ loss = F.nll_loss(output, target, reduction='sum')
210
+ loss.backward()
211
+ optimizer.step()
212
+ ddp_loss[0] += loss.item()
213
+ ddp_loss[1] += len(data)
214
+
215
+ dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)
216
+ if rank == 0:
217
+ print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1]))
218
+ ```
219
+
220
+ 2.3 Define a validation function
221
+
222
+ ``` {.python}
223
+ def test(model, rank, world_size, test_loader):
224
+ model.eval()
225
+ correct = 0
226
+ ddp_loss = torch.zeros(3).to(rank)
227
+ with torch.no_grad():
228
+ for data, target in test_loader:
229
+ data, target = data.to(rank), target.to(rank)
230
+ output = model(data)
231
+ ddp_loss[0] += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
232
+ pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
233
+ ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item()
234
+ ddp_loss[2] += len(data)
235
+
236
+ dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)
237
+
238
+ if rank == 0:
239
+ test_loss = ddp_loss[0] / ddp_loss[2]
240
+ print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
241
+ test_loss, int(ddp_loss[1]), int(ddp_loss[2]),
242
+ 100. * ddp_loss[1] / ddp_loss[2]))
243
+ ```
244
+
245
+ 2.4 Define a distributed train function that wraps the model in FSDP
246
+
247
+ **Note: to save the FSDP model, we need to call the state\_dict on each
248
+ rank then on Rank 0 save the overall states.**
249
+
250
+ ``` {.python}
251
+ def fsdp_main(rank, world_size, args):
252
+ setup(rank, world_size)
253
+
254
+ transform=transforms.Compose([
255
+ transforms.ToTensor(),
256
+ transforms.Normalize((0.1307,), (0.3081,))
257
+ ])
258
+
259
+ dataset1 = datasets.MNIST('../data', train=True, download=True,
260
+ transform=transform)
261
+ dataset2 = datasets.MNIST('../data', train=False,
262
+ transform=transform)
263
+
264
+ sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True)
265
+ sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size)
266
+
267
+ train_kwargs = {'batch_size': args.batch_size, 'sampler': sampler1}
268
+ test_kwargs = {'batch_size': args.test_batch_size, 'sampler': sampler2}
269
+ cuda_kwargs = {'num_workers': 2,
270
+ 'pin_memory': True,
271
+ 'shuffle': False}
272
+ train_kwargs.update(cuda_kwargs)
273
+ test_kwargs.update(cuda_kwargs)
274
+
275
+ train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
276
+ test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
277
+ my_auto_wrap_policy = functools.partial(
278
+ size_based_auto_wrap_policy, min_num_params=100
279
+ )
280
+ torch.cuda.set_device(rank)
281
+
282
+
283
+ init_start_event = torch.cuda.Event(enable_timing=True)
284
+ init_end_event = torch.cuda.Event(enable_timing=True)
285
+
286
+ model = Net().to(rank)
287
+
288
+ model = FSDP(model)
289
+
290
+ optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
291
+
292
+ scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
293
+ init_start_event.record()
294
+ for epoch in range(1, args.epochs + 1):
295
+ train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=sampler1)
296
+ test(model, rank, world_size, test_loader)
297
+ scheduler.step()
298
+
299
+ init_end_event.record()
300
+
301
+ if rank == 0:
302
+ print(f"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec")
303
+ print(f"{model}")
304
+
305
+ if args.save_model:
306
+ # use a barrier to make sure training is done on all ranks
307
+ dist.barrier()
308
+ states = model.state_dict()
309
+ if rank == 0:
310
+ torch.save(states, "mnist_cnn.pt")
311
+
312
+ cleanup()
313
+ ```
314
+
315
+ 2.5 Finally, parse the arguments and set the main function
316
+
317
+ ``` {.python}
318
+ if __name__ == '__main__':
319
+ # Training settings
320
+ parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
321
+ parser.add_argument('--batch-size', type=int, default=64, metavar='N',
322
+ help='input batch size for training (default: 64)')
323
+ parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
324
+ help='input batch size for testing (default: 1000)')
325
+ parser.add_argument('--epochs', type=int, default=10, metavar='N',
326
+ help='number of epochs to train (default: 14)')
327
+ parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
328
+ help='learning rate (default: 1.0)')
329
+ parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
330
+ help='Learning rate step gamma (default: 0.7)')
331
+ parser.add_argument('--no-cuda', action='store_true', default=False,
332
+ help='disables CUDA training')
333
+ parser.add_argument('--seed', type=int, default=1, metavar='S',
334
+ help='random seed (default: 1)')
335
+ parser.add_argument('--save-model', action='store_true', default=False,
336
+ help='For Saving the current Model')
337
+ args = parser.parse_args()
338
+
339
+ torch.manual_seed(args.seed)
340
+
341
+ WORLD_SIZE = torch.cuda.device_count()
342
+ mp.spawn(fsdp_main,
343
+ args=(WORLD_SIZE, args),
344
+ nprocs=WORLD_SIZE,
345
+ join=True)
346
+ ```
347
+
348
+ We have recorded CUDA events to measure the time of FSDP model
349
+ specifics. The CUDA event time was 110.85 seconds.
350
+
351
+ ``` {.bash}
352
+ python FSDP_mnist.py
353
+
354
+ CUDA event elapsed time on training loop 40.67462890625sec
355
+ ```
356
+
357
+ After wrapping the model with FSDP, the model looks as follows; we can see
358
+ that the model has been wrapped in one FSDP unit. Next, we will look at
359
+ adding the auto\_wrap\_policy and will discuss the differences.
360
+
361
+ ``` {.bash}
362
+ FullyShardedDataParallel(
363
+ (_fsdp_wrapped_module): FlattenParamsWrapper(
364
+ (_fpw_module): Net(
365
+ (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
366
+ (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
367
+ (dropout1): Dropout(p=0.25, inplace=False)
368
+ (dropout2): Dropout(p=0.5, inplace=False)
369
+ (fc1): Linear(in_features=9216, out_features=128, bias=True)
370
+ (fc2): Linear(in_features=128, out_features=10, bias=True)
371
+ )
372
+ )
373
+ )
374
+ ```
375
+
376
+ The following is the peak memory usage from FSDP MNIST training on a
377
+ g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch
378
+ Profiler.
379
+
380
+ ![FSDP Peak Memory
381
+ Usage](/_static/img/distributed/FSDP_memory.gif){.align-center
382
+ width="100.0%"}
383
+
384
+ Applying *auto\_wrap\_policy*: otherwise, FSDP will put the
385
+ entire model in one FSDP unit, which will reduce computation efficiency
386
+ and memory efficiency. The way it works is that, suppose your model
387
+ contains 100 Linear layers. If you do FSDP(model), there will only be
388
+ one FSDP unit which wraps the entire model. In that case, the allgather
389
+ would collect the full parameters for all 100 linear layers, and hence
390
+ won\'t save CUDA memory for parameter sharding. Also, there is only one
391
+ blocking allgather call for all 100 linear layers, so there will not be
392
+ communication and computation overlapping between layers.
393
+
394
+ To avoid that, you can pass in an auto\_wrap\_policy, which will seal
395
+ the current FSDP unit and start a new one automatically when the
396
+ specified condition is met (e.g., size limit). In that way you will have
397
+ multiple FSDP units, and only one FSDP unit needs to collect full
398
+ parameters at a time. E.g., suppose you have 5 FSDP units, and each
399
+ wraps 20 linear layers. Then, in the forward, the 1st FSDP unit will
400
+ allgather parameters for the first 20 linear layers, do computation,
401
+ discard the parameters and then move on to the next 20 linear layers.
402
+ So, at any point in time, each rank only materializes parameters/grads
403
+ for 20 linear layers instead of 100.
404
+
405
+ To do so, in 2.4 we define the auto\_wrap\_policy and pass it to the FSDP
406
+ wrapper. In the following example, my\_auto\_wrap\_policy defines that a
407
+ layer could be wrapped or sharded by FSDP if the number of parameters in
408
+ this layer is larger than 100. If the number of parameters in this layer
409
+ is smaller than 100, it will be wrapped with other small layers together
410
+ by FSDP. Finding an optimal auto wrap policy is challenging; PyTorch
411
+ will add auto tuning for this config in the future. Without an auto
412
+ tuning tool, it is good to profile your workflow using different auto
413
+ wrap policies experimentally and find the optimal one.
414
+
415
+ ``` {.python}
416
+ my_auto_wrap_policy = functools.partial(
417
+ size_based_auto_wrap_policy, min_num_params=20000
418
+ )
419
+ torch.cuda.set_device(rank)
420
+ model = Net().to(rank)
421
+
422
+ model = FSDP(model,
423
+ auto_wrap_policy=my_auto_wrap_policy)
424
+ ```
425
+
426
+ Applying the auto\_wrap\_policy, the model would be as follows:
427
+
428
+ ``` {.bash}
429
+ FullyShardedDataParallel(
430
+ (_fsdp_wrapped_module): FlattenParamsWrapper(
431
+ (_fpw_module): Net(
432
+ (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
433
+ (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
434
+ (dropout1): Dropout(p=0.25, inplace=False)
435
+ (dropout2): Dropout(p=0.5, inplace=False)
436
+ (fc1): FullyShardedDataParallel(
437
+ (_fsdp_wrapped_module): FlattenParamsWrapper(
438
+ (_fpw_module): Linear(in_features=9216, out_features=128, bias=True)
439
+ )
440
+ )
441
+ (fc2): Linear(in_features=128, out_features=10, bias=True)
442
+ )
443
+ )
444
+ ```
445
+
446
+ ``` {.bash}
447
+ python FSDP_mnist.py
448
+
449
+ CUDA event elapsed time on training loop 41.89130859375sec
450
+ ```
451
+
452
+ The following is the peak memory usage from FSDP with auto\_wrap policy
453
+ of MNIST training on a g4dn.12.xlarge AWS EC2 instance with 4 GPUs
454
+ captured from PyTorch Profiler. It can be observed that the peak memory
455
+ usage on each device is smaller compared to FSDP without auto wrap
456
+ policy applied, from \~75 MB to 66 MB.
457
+
458
+ ![FSDP Peak Memory Usage using Auto\_wrap
459
+ policy](/_static/img/distributed/FSDP_autowrap.gif){.align-center
460
+ width="100.0%"}
461
+
462
+ *CPU Off-loading*: In case the model is so large that even with FSDP it
463
+ wouldn\'t fit into GPUs, CPU offloading can be helpful here.
464
+
465
+ Currently, only parameter and gradient CPU offload is supported. It can
466
+ be enabled via passing in cpu\_offload=CPUOffload(offload\_params=True).
467
+
468
+ Note that this currently implicitly enables gradient offloading to CPU
469
+ in order for params and grads to be on the same device to work with the
470
+ optimizer. This API is subject to change. The default is None in which
471
+ case there will be no offloading.
472
+
473
+ Using this feature may slow down the training considerably, due to
474
+ frequent copying of tensors from host to device, but it could help
475
+ improve memory efficiency and train larger scale models.
476
+
477
+ In 2.4 we just add it to the FSDP wrapper
478
+
479
+ ``` {.python}
480
+ model = FSDP(model,
481
+ auto_wrap_policy=my_auto_wrap_policy,
482
+ cpu_offload=CPUOffload(offload_params=True))
483
+ ```
484
+
485
+ To compare it with DDP, in 2.4 we just normally wrap the model in DDP,
486
+ saving the changes in "DDP\_mnist.py".
487
+
488
+ ``` {.python}
489
+ model = Net().to(rank)
490
+ model = DDP(model)
491
+ ```
492
+
493
+ ``` {.bash}
494
+ python DDP_mnist.py
495
+
496
+ CUDA event elapsed time on training loop 39.77766015625sec
497
+ ```
498
+
499
+ The following is the peak memory usage from DDP MNIST training on a
500
+ g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch
501
+ profiler.
502
+
503
+ ![DDP Peak Memory Usage using Auto\_wrap
504
+ policy](/_static/img/distributed/DDP_memory.gif){.align-center
505
+ width="100.0%"}
506
+
507
+ Considering the toy example and tiny MNIST model we defined here, we can
508
+ observe the difference between peak memory usage of DDP and FSDP. In DDP
509
+ each process holds a replica of the model, so the memory footprint is
510
+ higher compared to FSDP which shards the model parameters, optimizer
511
+ states and gradients over DDP ranks. The peak memory usage using FSDP
512
+ with auto\_wrap policy is the lowest, followed by FSDP and DDP.
513
+
514
+ Also, looking at timings, considering the small model and running the
515
+ training on a single machine, FSDP with and without auto\_wrap policy
516
+ performed almost as fast as DDP. This example does not represent most of
517
+ the real applications; for detailed analysis and comparison between DDP
518
+ and FSDP please refer to this [blog
519
+ post](https://pytorch.medium.com/6c8da2be180d).
a_distributed_notebook/temp/all_gather.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.distributed as dist
3
+ import os
4
+ import torch
5
+ import torch.distributed as dist
6
+ import torch.multiprocessing as mp
7
+ from pprint import pprint
8
+ import time
9
+
10
+ def my_print(*args, **kwargs):
11
+ if dist.get_rank() == 0:
12
+ print(*args, **kwargs)
13
+ else:
14
+ time.sleep(0.01)
15
+ print(*args, **kwargs)
16
+
17
+ class AllGatherLB(torch.autograd.Function):
18
+ """
19
+ An autograd function that performs allgather on a tensor.
20
+ This function only performs local-backpropagation on a single GPU.
21
+ It has worse convergence and lower efficiency compared to global-backpropagation.
22
+ """
23
+
24
+ @staticmethod
25
+ def forward(ctx, tensor, rank, world_size):
26
+ output = [torch.empty_like(tensor) for _ in range(world_size)]
27
+ dist.all_gather(output, tensor)
28
+ ctx.rank = rank
29
+ ctx.batch_size = tensor.shape[0]
30
+ return torch.cat(output, 0)
31
+
32
+ @staticmethod
33
+ def backward(ctx, grad_output):
34
+ return (
35
+ grad_output[ctx.batch_size * ctx.rank : ctx.batch_size * (ctx.rank + 1)],
36
+ None,
37
+ None,
38
+ )
39
+
40
+
41
+ class AllGatherGB(torch.autograd.Function):
42
+ """
43
+ An autograd function that performs allgather on a tensor.
44
+ Global-backpropagation on all GPUs.
45
+ This function has better convergence and higher efficiency compared to local-backpropagation.
46
+ This function is used as the default gather strategy.
47
+ """
48
+
49
+ @staticmethod
50
+ def forward(ctx, tensor):
51
+ world_size = dist.get_world_size()
52
+ output = [torch.empty_like(tensor) for _ in range(world_size)]
53
+ dist.all_gather(output, tensor)
54
+ ctx.world_size = world_size
55
+ return torch.cat(output, 0)
56
+
57
+ @staticmethod
58
+ def backward(ctx, grad_output):
59
+ batch_size = grad_output.shape[0] // ctx.world_size
60
+ rank = dist.get_rank()
61
+ in_grad = grad_output.clone()
62
+
63
+ my_print("Rank ", rank, " has in_grad before all reduce", in_grad)
64
+ dist.all_reduce(in_grad, op=dist.ReduceOp.SUM)
65
+ my_print("Rank ", rank, " has in_grad after all reduce", in_grad)
66
+ return (in_grad[batch_size * rank : batch_size * (rank + 1)],)
67
+
68
+
69
+ all_gather = AllGatherGB.apply
70
+
71
+ """ All-Reduce example."""
72
+ def run(rank, size):
73
+ """ Simple collective communication. """
74
+ # group = dist.new_group([0, 1])
75
+ BATCH_SIZE = 4
76
+ LENGTH = 3
77
+ VECTOR_DIM = 2
78
+ tensor = torch.zeros(BATCH_SIZE, LENGTH, VECTOR_DIM) + (2 * rank - 2) # rank 0: -2, rank 1: 0
79
+ tensor.requires_grad = True
80
+ # tensor_list = [torch.zeros(4, 3) for _ in range(size)]
81
+ gather_tensor = all_gather(tensor)
82
+ # data
83
+ # print('Rank ', rank, ' has gather data ', gather_tensor)
84
+ # shape
85
+ print('Rank ', rank, ' has gather shape ', gather_tensor.shape)
86
+
87
+
88
+ loss = gather_tensor ** 2
89
+ # + random tensor
90
+ loss = loss.sum() + torch.rand(1, requires_grad=True)
91
+ loss.backward()
92
+
93
+ # Mathematically, the gradient of the loss w.r.t. gather_tensor is 2 * gather_tensor
94
+
95
+ print('Rank ', rank, ' has final tensor grad ', tensor.grad)
96
+
97
+
98
+ def init_process(rank, size, fn, backend='gloo'):
99
+ """ Initialize the distributed environment. """
100
+ os.environ['MASTER_ADDR'] = '127.0.0.1'
101
+ os.environ['MASTER_PORT'] = '29500'
102
+ dist.init_process_group(backend, rank=rank, world_size=size)
103
+ fn(rank, size)
104
+
105
+
106
+ if __name__ == "__main__":
107
+ size = 2
108
+ processes = []
109
+ mp.set_start_method("spawn")
110
+ for rank in range(size):
111
+ p = mp.Process(target=init_process, args=(rank, size, run))
112
+ p.start()
113
+ processes.append(p)
114
+
115
+ for p in processes:
116
+ p.join()
a_distributed_notebook/temp/run_4.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import tempfile
4
+ import torch
5
+ import torch.distributed as dist
6
+ import torch.nn as nn
7
+ import torch.optim as optim
8
+ import torch.multiprocessing as mp
9
+
10
+ from torch.nn.parallel import DistributedDataParallel as DDP
11
+
12
+ def setup(rank, world_size):
13
+ os.environ['MASTER_ADDR'] = 'localhost'
14
+ os.environ['MASTER_PORT'] = '12355'
15
+
16
+ # initialize the process group
17
+ dist.init_process_group("gloo", rank=rank, world_size=world_size)
18
+
19
+ def cleanup():
20
+ dist.destroy_process_group()
21
+
22
+ class ToyModel(nn.Module):
23
+ def __init__(self):
24
+ super(ToyModel, self).__init__()
25
+ self.net1 = nn.Linear(10, 10)
26
+ self.relu = nn.ReLU()
27
+ self.net2 = nn.Linear(10, 5)
28
+
29
+ def forward(self, x):
30
+ return self.net2(self.relu(self.net1(x)))
31
+
32
+
33
+ def demo_basic(rank, world_size):
34
+ print(f"Running basic DDP example on rank {rank}.")
35
+ setup(rank, world_size)
36
+
37
+ # create model and move it to GPU with id rank
38
+ model = ToyModel().to(rank)
39
+ ddp_model = DDP(model, device_ids=[rank])
40
+
41
+ loss_fn = nn.MSELoss()
42
+ optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
43
+
44
+ optimizer.zero_grad()
45
+ outputs = ddp_model(torch.randn(20, 10))
46
+ labels = torch.randn(20, 5).to(rank)
47
+ loss_fn(outputs, labels).backward()
48
+ optimizer.step()
49
+
50
+ cleanup()
51
+ print(f"Finished running basic DDP example on rank {rank}.")
52
+
53
+
54
+ def run_demo(demo_fn, world_size):
55
+ print(f"Running DDP example with {world_size} processes.")
56
+ mp.set_start_method("spawn")
57
+ mp.spawn(demo_fn,
58
+ args=(world_size,),
59
+ nprocs=world_size,
60
+ join=True)
61
+
62
+ if __name__ == "__main__":
63
+ run_demo(demo_basic, 2)
a_main_folder/convert_hf_dataset.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_temp/deepseek_vl2.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_temp/docs.html ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+
4
+ <head>
5
+ <link type="text/css" rel="stylesheet" href="https://cdn.jsdelivr.net/npm/swagger-ui-dist@5/swagger-ui.css">
6
+ <link rel="shortcut icon" href="https://fastapi.tiangolo.com/img/favicon.png">
7
+ <title>FastAPI - Swagger UI</title>
8
+ </head>
9
+
10
+ <body>
11
+ <div id="swagger-ui">
12
+ </div>
13
+ <script src="https://cdn.jsdelivr.net/npm/swagger-ui-dist@5/swagger-ui-bundle.js"></script>
14
+ <!-- `SwaggerUIBundle` is now available on the page -->
15
+ <script>
16
+ const ui = SwaggerUIBundle({
17
+ url: '/openapi.json',
18
+ "dom_id": "#swagger-ui",
19
+ "layout": "BaseLayout",
20
+ "deepLinking": true,
21
+ "showExtensions": true,
22
+ "showCommonExtensions": true,
23
+ oauth2RedirectUrl: window.location.origin + '/docs/oauth2-redirect',
24
+ presets: [
25
+ SwaggerUIBundle.presets.apis,
26
+ SwaggerUIBundle.SwaggerUIStandalonePreset
27
+ ],
28
+ })
29
+ </script>
30
+ </body>
31
+
32
+ </html>
a_temp/example_image.jpg ADDED
a_temp/openapi.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"openapi":"3.1.0","info":{"title":"FastAPI","version":"0.1.0"},"paths":{"/health":{"get":{"summary":"Health","description":"Health check.","operationId":"health_health_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/tokenize":{"post":{"summary":"Tokenize","operationId":"tokenize_tokenize_post","requestBody":{"content":{"application/json":{"schema":{"anyOf":[{"$ref":"#/components/schemas/TokenizeCompletionRequest"},{"$ref":"#/components/schemas/TokenizeChatRequest"}],"title":"Request"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/detokenize":{"post":{"summary":"Detokenize","operationId":"detokenize_detokenize_post","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/DetokenizeRequest"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/models":{"get":{"summary":"Show Available Models","operationId":"show_available_models_v1_models_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/version":{"get":{"summary":"Show Version","operationId":"show_version_version_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/v1/chat/completions":{"post":{"summary":"Create Chat Completion","operationId":"create_chat_completion_v1_chat_completions_post","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ChatCompletionRequest"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/completions":{"post":{"summary":"Create Completion","operationId":"create_completion_v1_completions_post","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/CompletionRequest"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/embeddings":{"post":{"summary":"Create Embedding","operationId":"create_embedding_v1_embeddings_post","requestBody":{"content":{"application/json":{"schema":{"anyOf":[{"$ref":"#/components/schemas/EmbeddingCompletionRequest"},{"$ref":"#/components/schemas/EmbeddingChatRequest"}],"title":"Request"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/pooling":{"post":{"summary":"Create 
Pooling","operationId":"create_pooling_pooling_post","requestBody":{"content":{"application/json":{"schema":{"anyOf":[{"$ref":"#/components/schemas/EmbeddingCompletionRequest"},{"$ref":"#/components/schemas/EmbeddingChatRequest"}],"title":"Request"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/score":{"post":{"summary":"Create Score","operationId":"create_score_score_post","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ScoreRequest"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/v1/score":{"post":{"summary":"Create Score V1","operationId":"create_score_v1_v1_score_post","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ScoreRequest"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"Audio":{"properties":{"id":{"type":"string","title":"Id"}},"type":"object","required":["id"],"title":"Audio"},"AudioURL":{"properties":{"url":{"type":"string","title":"Url"}},"type":"object","required":["url"],"title":"AudioURL"},"BaseModel":{"properties":{},"type":"object","title":"BaseModel"},"ChatCompletionAssistantMessageParam":{"properties":{"role":{"type":"string","enum":["assistant"],"const":"assistant","title":"Role"},"audio":{"anyOf":[{"$ref":"#/components/schemas/Audio"},{"type":"null"}]},"content":{"anyOf":[{"type":"string"},{"items":{"anyOf":[{"$ref":"#/components/schemas/ChatCompletionContentPartTextParam"},{"$ref":"#/components/schemas/ChatCompletionContentPartRefusalParam"}]},"type":"array"},{"type":"null"}],"title":"Content"},"function_call":{"anyOf":[{"$ref":"#/components/schemas/FunctionCall"},{"type":"null"}]},"name":{"type":"string","title":"Name"},"refusal":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Refusal"},"tool_calls":{"items":{"$ref":"#/components/schemas/ChatCompletionMessageToolCallParam"},"type":"array","title":"Tool 
Calls"}},"type":"object","required":["role"],"title":"ChatCompletionAssistantMessageParam"},"ChatCompletionContentPartAudioParam":{"properties":{"audio_url":{"$ref":"#/components/schemas/AudioURL"},"type":{"type":"string","enum":["audio_url"],"const":"audio_url","title":"Type"}},"type":"object","required":["audio_url","type"],"title":"ChatCompletionContentPartAudioParam"},"ChatCompletionContentPartImageParam":{"properties":{"image_url":{"$ref":"#/components/schemas/ImageURL"},"type":{"type":"string","enum":["image_url"],"const":"image_url","title":"Type"}},"type":"object","required":["image_url","type"],"title":"ChatCompletionContentPartImageParam"},"ChatCompletionContentPartInputAudioParam":{"properties":{"input_audio":{"$ref":"#/components/schemas/InputAudio"},"type":{"type":"string","enum":["input_audio"],"const":"input_audio","title":"Type"}},"type":"object","required":["input_audio","type"],"title":"ChatCompletionContentPartInputAudioParam"},"ChatCompletionContentPartRefusalParam":{"properties":{"refusal":{"type":"string","title":"Refusal"},"type":{"type":"string","enum":["refusal"],"const":"refusal","title":"Type"}},"type":"object","required":["refusal","type"],"title":"ChatCompletionContentPartRefusalParam"},"ChatCompletionContentPartTextParam":{"properties":{"text":{"type":"string","title":"Text"},"type":{"type":"string","enum":["text"],"const":"text","title":"Type"}},"type":"object","required":["text","type"],"title":"ChatCompletionContentPartTextParam"},"ChatCompletionContentPartVideoParam":{"properties":{"video_url":{"$ref":"#/components/schemas/VideoURL"},"type":{"type":"string","enum":["video_url"],"const":"video_url","title":"Type"}},"type":"object","required":["video_url","type"],"title":"ChatCompletionContentPartVideoParam"},"ChatCompletionDeveloperMessageParam":{"properties":{"content":{"anyOf":[{"type":"string"},{"items":{"$ref":"#/components/schemas/ChatCompletionContentPartTextParam"},"type":"array"}],"title":"Content"},"role":{"type":"string","enum":["developer"],"const":"developer","title":"Role"},"name":{"type":"string","title":"Name"}},"type":"object","required":["content","role"],"title":"ChatCompletionDeveloperMessageParam"},"ChatCompletionFunctionMessageParam":{"properties":{"content":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Content"},"name":{"type":"string","title":"Name"},"role":{"type":"string","enum":["function"],"const":"function","title":"Role"}},"type":"object","required":["content","name","role"],"title":"ChatCompletionFunctionMessageParam"},"ChatCompletionMessageToolCallParam":{"properties":{"id":{"type":"string","title":"Id"},"function":{"$ref":"#/components/schemas/Function"},"type":{"type":"string","enum":["function"],"const":"function","title":"Type"}},"type":"object","required":["id","function","type"],"title":"ChatCompletionMessageToolCallParam"},"ChatCompletionNamedFunction":{"properties":{"name":{"type":"string","title":"Name"}},"additionalProperties":true,"type":"object","required":["name"],"title":"ChatCompletionNamedFunction"},"ChatCompletionNamedToolChoiceParam":{"properties":{"function":{"$ref":"#/components/schemas/ChatCompletionNamedFunction"},"type":{"type":"string","enum":["function"],"const":"function","title":"Type","default":"function"}},"additionalProperties":true,"type":"object","required":["function"],"title":"ChatCompletionNamedToolChoiceParam"},"ChatCompletionRequest":{"properties":{"messages":{"items":{"anyOf":[{"$ref":"#/components/schemas/ChatCompletionDeveloperMessageParam"},{"$ref":"#/components/schemas/ChatCompl
etionSystemMessageParam"},{"$ref":"#/components/schemas/ChatCompletionUserMessageParam"},{"$ref":"#/components/schemas/ChatCompletionAssistantMessageParam"},{"$ref":"#/components/schemas/ChatCompletionToolMessageParam"},{"$ref":"#/components/schemas/ChatCompletionFunctionMessageParam"},{"$ref":"#/components/schemas/CustomChatCompletionMessageParam"}]},"type":"array","title":"Messages"},"model":{"type":"string","title":"Model"},"frequency_penalty":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Frequency Penalty","default":0.0},"logit_bias":{"anyOf":[{"additionalProperties":{"type":"number"},"type":"object"},{"type":"null"}],"title":"Logit Bias"},"logprobs":{"anyOf":[{"type":"boolean"},{"type":"null"}],"title":"Logprobs","default":false},"top_logprobs":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Top Logprobs","default":0},"max_tokens":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Max Tokens","deprecated":true},"max_completion_tokens":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Max Completion Tokens"},"n":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"N","default":1},"presence_penalty":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Presence Penalty","default":0.0},"response_format":{"anyOf":[{"$ref":"#/components/schemas/ResponseFormat"},{"type":"null"}]},"seed":{"anyOf":[{"type":"integer","maximum":9.223372036854776e+18,"minimum":-9.223372036854776e+18},{"type":"null"}],"title":"Seed"},"stop":{"anyOf":[{"type":"string"},{"items":{"type":"string"},"type":"array"},{"type":"null"}],"title":"Stop"},"stream":{"anyOf":[{"type":"boolean"},{"type":"null"}],"title":"Stream","default":false},"stream_options":{"anyOf":[{"$ref":"#/components/schemas/StreamOptions"},{"type":"null"}]},"temperature":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Temperature"},"top_p":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Top P"},"tools":{"anyOf":[{"items":{"$ref":"#/components/schemas/ChatCompletionToolsParam"},"type":"array"},{"type":"null"}],"title":"Tools"},"tool_choice":{"anyOf":[{"type":"string","enum":["none"],"const":"none"},{"type":"string","enum":["auto"],"const":"auto"},{"$ref":"#/components/schemas/ChatCompletionNamedToolChoiceParam"},{"type":"null"}],"title":"Tool Choice","default":"none"},"parallel_tool_calls":{"anyOf":[{"type":"boolean"},{"type":"null"}],"title":"Parallel Tool Calls","default":false},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"best_of":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Best Of"},"use_beam_search":{"type":"boolean","title":"Use Beam Search","default":false},"top_k":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Top K"},"min_p":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Min P"},"repetition_penalty":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Repetition Penalty"},"length_penalty":{"type":"number","title":"Length Penalty","default":1.0},"stop_token_ids":{"anyOf":[{"items":{"type":"integer"},"type":"array"},{"type":"null"}],"title":"Stop Token Ids"},"include_stop_str_in_output":{"type":"boolean","title":"Include Stop Str In Output","default":false},"ignore_eos":{"type":"boolean","title":"Ignore Eos","default":false},"min_tokens":{"type":"integer","title":"Min Tokens","default":0},"skip_special_tokens":{"type":"boolean","title":"Skip Special Tokens","default":true},"spaces_between_special_tokens":{"type":"boolean","title":"Spaces Between Special 
Tokens","default":true},"truncate_prompt_tokens":{"anyOf":[{"type":"integer","minimum":1.0},{"type":"null"}],"title":"Truncate Prompt Tokens"},"prompt_logprobs":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Prompt Logprobs"},"echo":{"type":"boolean","title":"Echo","description":"If true, the new message will be prepended with the last message if they belong to the same role.","default":false},"add_generation_prompt":{"type":"boolean","title":"Add Generation Prompt","description":"If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.","default":true},"continue_final_message":{"type":"boolean","title":"Continue Final Message","description":"If this is set, the chat will be formatted so that the final message in the chat is open-ended, without any EOS tokens. The model will continue this message rather than starting a new one. This allows you to \"prefill\" part of the model's response for it. Cannot be used at the same time as `add_generation_prompt`.","default":false},"add_special_tokens":{"type":"boolean","title":"Add Special Tokens","description":"If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).","default":false},"documents":{"anyOf":[{"items":{"additionalProperties":{"type":"string"},"type":"object"},"type":"array"},{"type":"null"}],"title":"Documents","description":"A list of dicts representing documents that will be accessible to the model if it is performing RAG (retrieval-augmented generation). If the template does not support RAG, this argument will have no effect. We recommend that each document should be a dict containing \"title\" and \"text\" keys."},"chat_template":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Chat Template","description":"A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one."},"chat_template_kwargs":{"anyOf":[{"type":"object"},{"type":"null"}],"title":"Chat Template Kwargs","description":"Additional kwargs to pass to the template renderer. Will be accessible by the chat template."},"guided_json":{"anyOf":[{"type":"string"},{"type":"object"},{"$ref":"#/components/schemas/BaseModel"},{"type":"null"}],"title":"Guided Json","description":"If specified, the output will follow the JSON schema."},"guided_regex":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Guided Regex","description":"If specified, the output will follow the regex pattern."},"guided_choice":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"null"}],"title":"Guided Choice","description":"If specified, the output will be exactly one of the choices."},"guided_grammar":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Guided Grammar","description":"If specified, the output will follow the context free grammar."},"guided_decoding_backend":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Guided Decoding Backend","description":"If specified, will override the default guided decoding backend of the server for this specific request. 
If set, must be either 'outlines' / 'lm-format-enforcer'"},"guided_whitespace_pattern":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Guided Whitespace Pattern","description":"If specified, will override the default whitespace pattern for guided json decoding."},"priority":{"type":"integer","title":"Priority","description":"The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.","default":0},"request_id":{"type":"string","title":"Request Id","description":"The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response."},"logits_processors":{"anyOf":[{"items":{"anyOf":[{"type":"string"},{"$ref":"#/components/schemas/LogitsProcessorConstructor"}]},"type":"array"},{"type":"null"}],"title":"Logits Processors","description":"A list of either qualified names of logits processors, or constructor objects, to apply when sampling. A constructor is a JSON object with a required 'qualname' field specifying the qualified name of the processor class/factory, and optional 'args' and 'kwargs' fields containing positional and keyword arguments. For example: {'qualname': 'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': {'param': 'value'}}."}},"additionalProperties":true,"type":"object","required":["messages","model"],"title":"ChatCompletionRequest"},"ChatCompletionSystemMessageParam":{"properties":{"content":{"anyOf":[{"type":"string"},{"items":{"$ref":"#/components/schemas/ChatCompletionContentPartTextParam"},"type":"array"}],"title":"Content"},"role":{"type":"string","enum":["system"],"const":"system","title":"Role"},"name":{"type":"string","title":"Name"}},"type":"object","required":["content","role"],"title":"ChatCompletionSystemMessageParam"},"ChatCompletionToolMessageParam":{"properties":{"content":{"anyOf":[{"type":"string"},{"items":{"$ref":"#/components/schemas/ChatCompletionContentPartTextParam"},"type":"array"}],"title":"Content"},"role":{"type":"string","enum":["tool"],"const":"tool","title":"Role"},"tool_call_id":{"type":"string","title":"Tool Call Id"}},"type":"object","required":["content","role","tool_call_id"],"title":"ChatCompletionToolMessageParam"},"ChatCompletionToolsParam":{"properties":{"type":{"type":"string","enum":["function"],"const":"function","title":"Type","default":"function"},"function":{"$ref":"#/components/schemas/FunctionDefinition"}},"additionalProperties":true,"type":"object","required":["function"],"title":"ChatCompletionToolsParam"},"ChatCompletionUserMessageParam":{"properties":{"content":{"anyOf":[{"type":"string"},{"items":{"anyOf":[{"$ref":"#/components/schemas/ChatCompletionContentPartTextParam"},{"$ref":"#/components/schemas/ChatCompletionContentPartImageParam"},{"$ref":"#/components/schemas/ChatCompletionContentPartInputAudioParam"}]},"type":"array"}],"title":"Content"},"role":{"type":"string","enum":["user"],"const":"user","title":"Role"},"name":{"type":"string","title":"Name"}},"type":"object","required":["content","role"],"title":"ChatCompletionUserMessageParam"},"CompletionRequest":{"properties":{"model":{"type":"string","title":"Model"},"prompt":{"anyOf":[{"items":{"type":"integer"},"type":"array"},{"items":{"items":{"type":"integer"},"type":"array"},"type":"array"},{"type":"string"},{"items":{"type":"string"},"type":"array"}],"title":"Prompt"},"best_of":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Best 
Of"},"echo":{"anyOf":[{"type":"boolean"},{"type":"null"}],"title":"Echo","default":false},"frequency_penalty":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Frequency Penalty","default":0.0},"logit_bias":{"anyOf":[{"additionalProperties":{"type":"number"},"type":"object"},{"type":"null"}],"title":"Logit Bias"},"logprobs":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Logprobs"},"max_tokens":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Max Tokens","default":16},"n":{"type":"integer","title":"N","default":1},"presence_penalty":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Presence Penalty","default":0.0},"seed":{"anyOf":[{"type":"integer","maximum":9.223372036854776e+18,"minimum":-9.223372036854776e+18},{"type":"null"}],"title":"Seed"},"stop":{"anyOf":[{"type":"string"},{"items":{"type":"string"},"type":"array"},{"type":"null"}],"title":"Stop"},"stream":{"anyOf":[{"type":"boolean"},{"type":"null"}],"title":"Stream","default":false},"stream_options":{"anyOf":[{"$ref":"#/components/schemas/StreamOptions"},{"type":"null"}]},"suffix":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Suffix"},"temperature":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Temperature"},"top_p":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Top P"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"use_beam_search":{"type":"boolean","title":"Use Beam Search","default":false},"top_k":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Top K"},"min_p":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Min P"},"repetition_penalty":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Repetition Penalty"},"length_penalty":{"type":"number","title":"Length Penalty","default":1.0},"stop_token_ids":{"anyOf":[{"items":{"type":"integer"},"type":"array"},{"type":"null"}],"title":"Stop Token Ids"},"include_stop_str_in_output":{"type":"boolean","title":"Include Stop Str In Output","default":false},"ignore_eos":{"type":"boolean","title":"Ignore Eos","default":false},"min_tokens":{"type":"integer","title":"Min Tokens","default":0},"skip_special_tokens":{"type":"boolean","title":"Skip Special Tokens","default":true},"spaces_between_special_tokens":{"type":"boolean","title":"Spaces Between Special Tokens","default":true},"truncate_prompt_tokens":{"anyOf":[{"type":"integer","minimum":1.0},{"type":"null"}],"title":"Truncate Prompt Tokens"},"allowed_token_ids":{"anyOf":[{"items":{"type":"integer"},"type":"array"},{"type":"null"}],"title":"Allowed Token Ids"},"prompt_logprobs":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Prompt Logprobs"},"add_special_tokens":{"type":"boolean","title":"Add Special Tokens","description":"If true (the default), special tokens (e.g. BOS) will be added to the prompt.","default":true},"response_format":{"anyOf":[{"$ref":"#/components/schemas/ResponseFormat"},{"type":"null"}],"description":"Similar to chat completion, this parameter specifies the format of output. 
Only {'type': 'json_object'}, {'type': 'json_schema'} or {'type': 'text' } is supported."},"guided_json":{"anyOf":[{"type":"string"},{"type":"object"},{"$ref":"#/components/schemas/BaseModel"},{"type":"null"}],"title":"Guided Json","description":"If specified, the output will follow the JSON schema."},"guided_regex":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Guided Regex","description":"If specified, the output will follow the regex pattern."},"guided_choice":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"null"}],"title":"Guided Choice","description":"If specified, the output will be exactly one of the choices."},"guided_grammar":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Guided Grammar","description":"If specified, the output will follow the context free grammar."},"guided_decoding_backend":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Guided Decoding Backend","description":"If specified, will override the default guided decoding backend of the server for this specific request. If set, must be one of 'outlines' / 'lm-format-enforcer'"},"guided_whitespace_pattern":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Guided Whitespace Pattern","description":"If specified, will override the default whitespace pattern for guided json decoding."},"priority":{"type":"integer","title":"Priority","description":"The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.","default":0},"logits_processors":{"anyOf":[{"items":{"anyOf":[{"type":"string"},{"$ref":"#/components/schemas/LogitsProcessorConstructor"}]},"type":"array"},{"type":"null"}],"title":"Logits Processors","description":"A list of either qualified names of logits processors, or constructor objects, to apply when sampling. A constructor is a JSON object with a required 'qualname' field specifying the qualified name of the processor class/factory, and optional 'args' and 'kwargs' fields containing positional and keyword arguments. 
For example: {'qualname': 'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': {'param': 'value'}}."}},"additionalProperties":true,"type":"object","required":["model","prompt"],"title":"CompletionRequest"},"CustomChatCompletionContentSimpleAudioParam":{"properties":{"audio_url":{"type":"string","title":"Audio Url"}},"type":"object","required":["audio_url"],"title":"CustomChatCompletionContentSimpleAudioParam","description":"A simpler version of the param that only accepts a plain audio_url.\n\nExample:\n{\n \"audio_url\": \"https://example.com/audio.mp3\"\n}"},"CustomChatCompletionContentSimpleImageParam":{"properties":{"image_url":{"type":"string","title":"Image Url"}},"type":"object","required":["image_url"],"title":"CustomChatCompletionContentSimpleImageParam","description":"A simpler version of the param that only accepts a plain image_url.\nThis is supported by OpenAI API, although it is not documented.\n\nExample:\n{\n \"image_url\": \"https://example.com/image.jpg\"\n}"},"CustomChatCompletionContentSimpleVideoParam":{"properties":{"video_url":{"type":"string","title":"Video Url"}},"type":"object","required":["video_url"],"title":"CustomChatCompletionContentSimpleVideoParam","description":"A simpler version of the param that only accepts a plain audio_url.\n\nExample:\n{\n \"video_url\": \"https://example.com/video.mp4\"\n}"},"CustomChatCompletionMessageParam":{"properties":{"role":{"type":"string","title":"Role"},"content":{"anyOf":[{"type":"string"},{"items":{"anyOf":[{"$ref":"#/components/schemas/ChatCompletionContentPartTextParam"},{"$ref":"#/components/schemas/ChatCompletionContentPartImageParam"},{"$ref":"#/components/schemas/ChatCompletionContentPartInputAudioParam"},{"$ref":"#/components/schemas/ChatCompletionContentPartAudioParam"},{"$ref":"#/components/schemas/ChatCompletionContentPartVideoParam"},{"$ref":"#/components/schemas/ChatCompletionContentPartRefusalParam"},{"$ref":"#/components/schemas/CustomChatCompletionContentSimpleImageParam"},{"$ref":"#/components/schemas/CustomChatCompletionContentSimpleAudioParam"},{"$ref":"#/components/schemas/CustomChatCompletionContentSimpleVideoParam"},{"type":"string"}]},"type":"array"}],"title":"Content"},"name":{"type":"string","title":"Name"},"tool_call_id":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Tool Call Id"},"tool_calls":{"anyOf":[{"items":{"$ref":"#/components/schemas/ChatCompletionMessageToolCallParam"},"type":"array"},{"type":"null"}],"title":"Tool Calls"}},"type":"object","required":["role"],"title":"CustomChatCompletionMessageParam","description":"Enables custom roles in the Chat Completion 
API."},"DetokenizeRequest":{"properties":{"model":{"type":"string","title":"Model"},"tokens":{"items":{"type":"integer"},"type":"array","title":"Tokens"}},"additionalProperties":true,"type":"object","required":["model","tokens"],"title":"DetokenizeRequest"},"EmbeddingChatRequest":{"properties":{"model":{"type":"string","title":"Model"},"messages":{"items":{"anyOf":[{"$ref":"#/components/schemas/ChatCompletionDeveloperMessageParam"},{"$ref":"#/components/schemas/ChatCompletionSystemMessageParam"},{"$ref":"#/components/schemas/ChatCompletionUserMessageParam"},{"$ref":"#/components/schemas/ChatCompletionAssistantMessageParam"},{"$ref":"#/components/schemas/ChatCompletionToolMessageParam"},{"$ref":"#/components/schemas/ChatCompletionFunctionMessageParam"},{"$ref":"#/components/schemas/CustomChatCompletionMessageParam"}]},"type":"array","title":"Messages"},"encoding_format":{"type":"string","enum":["float","base64"],"title":"Encoding Format","default":"float"},"dimensions":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Dimensions"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"truncate_prompt_tokens":{"anyOf":[{"type":"integer","minimum":1.0},{"type":"null"}],"title":"Truncate Prompt Tokens"},"additional_data":{"anyOf":[{},{"type":"null"}],"title":"Additional Data"},"add_special_tokens":{"type":"boolean","title":"Add Special Tokens","description":"If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).","default":false},"chat_template":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Chat Template","description":"A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one."},"chat_template_kwargs":{"anyOf":[{"type":"object"},{"type":"null"}],"title":"Chat Template Kwargs","description":"Additional kwargs to pass to the template renderer. Will be accessible by the chat template."},"priority":{"type":"integer","title":"Priority","description":"The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.","default":0}},"additionalProperties":true,"type":"object","required":["model","messages"],"title":"EmbeddingChatRequest"},"EmbeddingCompletionRequest":{"properties":{"model":{"type":"string","title":"Model"},"input":{"anyOf":[{"items":{"type":"integer"},"type":"array"},{"items":{"items":{"type":"integer"},"type":"array"},"type":"array"},{"type":"string"},{"items":{"type":"string"},"type":"array"}],"title":"Input"},"encoding_format":{"type":"string","enum":["float","base64"],"title":"Encoding Format","default":"float"},"dimensions":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Dimensions"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"truncate_prompt_tokens":{"anyOf":[{"type":"integer","minimum":1.0},{"type":"null"}],"title":"Truncate Prompt Tokens"},"additional_data":{"anyOf":[{},{"type":"null"}],"title":"Additional Data"},"add_special_tokens":{"type":"boolean","title":"Add Special Tokens","description":"If true (the default), special tokens (e.g. 
BOS) will be added to the prompt.","default":true},"priority":{"type":"integer","title":"Priority","description":"The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.","default":0}},"additionalProperties":true,"type":"object","required":["model","input"],"title":"EmbeddingCompletionRequest"},"Function":{"properties":{"arguments":{"type":"string","title":"Arguments"},"name":{"type":"string","title":"Name"}},"type":"object","required":["arguments","name"],"title":"Function"},"FunctionCall":{"properties":{"arguments":{"type":"string","title":"Arguments"},"name":{"type":"string","title":"Name"}},"type":"object","required":["arguments","name"],"title":"FunctionCall"},"FunctionDefinition":{"properties":{"name":{"type":"string","title":"Name"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Description"},"parameters":{"anyOf":[{"type":"object"},{"type":"null"}],"title":"Parameters"}},"additionalProperties":true,"type":"object","required":["name"],"title":"FunctionDefinition"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ImageURL":{"properties":{"url":{"type":"string","title":"Url"},"detail":{"type":"string","enum":["auto","low","high"],"title":"Detail"}},"type":"object","required":["url"],"title":"ImageURL"},"InputAudio":{"properties":{"data":{"type":"string","title":"Data"},"format":{"type":"string","enum":["wav","mp3"],"title":"Format"}},"type":"object","required":["data","format"],"title":"InputAudio"},"JsonSchemaResponseFormat":{"properties":{"name":{"type":"string","title":"Name"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Description"},"schema":{"anyOf":[{"type":"object"},{"type":"null"}],"title":"Schema"},"strict":{"anyOf":[{"type":"boolean"},{"type":"null"}],"title":"Strict"}},"additionalProperties":true,"type":"object","required":["name"],"title":"JsonSchemaResponseFormat"},"LogitsProcessorConstructor":{"properties":{"qualname":{"type":"string","title":"Qualname"},"args":{"anyOf":[{"items":{},"type":"array"},{"type":"null"}],"title":"Args"},"kwargs":{"anyOf":[{"type":"object"},{"type":"null"}],"title":"Kwargs"}},"type":"object","required":["qualname"],"title":"LogitsProcessorConstructor"},"ResponseFormat":{"properties":{"type":{"type":"string","enum":["text","json_object","json_schema"],"title":"Type"},"json_schema":{"anyOf":[{"$ref":"#/components/schemas/JsonSchemaResponseFormat"},{"type":"null"}]}},"additionalProperties":true,"type":"object","required":["type"],"title":"ResponseFormat"},"ScoreRequest":{"properties":{"model":{"type":"string","title":"Model"},"text_1":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"}],"title":"Text 1"},"text_2":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"}],"title":"Text 2"},"truncate_prompt_tokens":{"anyOf":[{"type":"integer","minimum":1.0},{"type":"null"}],"title":"Truncate Prompt Tokens"},"additional_data":{"anyOf":[{},{"type":"null"}],"title":"Additional Data"},"priority":{"type":"integer","title":"Priority","description":"The priority of the request (lower means earlier handling; default: 0). 
Any priority other than 0 will raise an error if the served model does not use priority scheduling.","default":0}},"additionalProperties":true,"type":"object","required":["model","text_1","text_2"],"title":"ScoreRequest"},"StreamOptions":{"properties":{"include_usage":{"anyOf":[{"type":"boolean"},{"type":"null"}],"title":"Include Usage","default":true},"continuous_usage_stats":{"anyOf":[{"type":"boolean"},{"type":"null"}],"title":"Continuous Usage Stats","default":false}},"additionalProperties":true,"type":"object","title":"StreamOptions"},"TokenizeChatRequest":{"properties":{"model":{"type":"string","title":"Model"},"messages":{"items":{"anyOf":[{"$ref":"#/components/schemas/ChatCompletionDeveloperMessageParam"},{"$ref":"#/components/schemas/ChatCompletionSystemMessageParam"},{"$ref":"#/components/schemas/ChatCompletionUserMessageParam"},{"$ref":"#/components/schemas/ChatCompletionAssistantMessageParam"},{"$ref":"#/components/schemas/ChatCompletionToolMessageParam"},{"$ref":"#/components/schemas/ChatCompletionFunctionMessageParam"},{"$ref":"#/components/schemas/CustomChatCompletionMessageParam"}]},"type":"array","title":"Messages"},"add_generation_prompt":{"type":"boolean","title":"Add Generation Prompt","description":"If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.","default":true},"continue_final_message":{"type":"boolean","title":"Continue Final Message","description":"If this is set, the chat will be formatted so that the final message in the chat is open-ended, without any EOS tokens. The model will continue this message rather than starting a new one. This allows you to \"prefill\" part of the model's response for it. Cannot be used at the same time as `add_generation_prompt`.","default":false},"add_special_tokens":{"type":"boolean","title":"Add Special Tokens","description":"If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).","default":false},"chat_template":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Chat Template","description":"A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one."},"chat_template_kwargs":{"anyOf":[{"type":"object"},{"type":"null"}],"title":"Chat Template Kwargs","description":"Additional kwargs to pass to the template renderer. Will be accessible by the chat template."}},"additionalProperties":true,"type":"object","required":["model","messages"],"title":"TokenizeChatRequest"},"TokenizeCompletionRequest":{"properties":{"model":{"type":"string","title":"Model"},"prompt":{"type":"string","title":"Prompt"},"add_special_tokens":{"type":"boolean","title":"Add Special Tokens","description":"If true (the default), special tokens (e.g. 
BOS) will be added to the prompt.","default":true}},"additionalProperties":true,"type":"object","required":["model","prompt"],"title":"TokenizeCompletionRequest"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VideoURL":{"properties":{"url":{"type":"string","title":"Url"}},"type":"object","required":["url"],"title":"VideoURL"}}}}
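The JSON above is the tail of the vLLM OpenAI-compatible server's OpenAPI spec: the ChatCompletionRequest / CompletionRequest schemas plus vLLM-only extensions such as guided_json, guided_choice, guided_regex, logits_processors and priority. A minimal sketch of how those extra fields ride on top of the standard openai client, assuming a local vLLM server on port 19400 (the port the notebook below uses) and placeholder prompt text:

from openai import OpenAI

# assumed local endpoint; vLLM ignores the API key unless the server was started with --api-key
client = OpenAI(api_key="EMPTY", base_url="http://0.0.0.0:19400/v1")
model_name = client.models.list().data[0].id

# vLLM-specific request fields (guided_choice, guided_json, priority, ...) are not part of
# the official OpenAI client signature, so they are passed through extra_body.
completion = client.chat.completions.create(
    model=model_name,
    messages=[{"role": "user", "content": "Is the person carrying a bag? Answer yes or no."}],
    max_tokens=4,
    extra_body={"guided_choice": ["yes", "no"]},
)
print(completion.choices[0].message.content)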
a_temp/temp1.ipynb ADDED
@@ -0,0 +1,330 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/dscilab_dungvo/workspace/bin/envs/lmdeploy/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import datasets, huggingface_hub\n",
19
+ "disk_path ='/dscilab_dungvo/workspace/BA-PRE_THESIS/dataset_pretraining/SYNTH-PEDES/annotation_english_vietnamese_processed'\n",
20
+ "dataset = datasets.load_from_disk(disk_path)"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "CUDA_VISIBLE_DEVICES=0 python inference.py --model_path \"deepseek-ai/deepseek-vl2-small\" --chunk_size 512\n",
30
+ "CUDA_VISIBLE_DEVICES=0,1,2 python inference.py --model_path \"deepseek-ai/deepseek-vl2\""
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 31,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "# Base64\n",
40
+ "import requests\n",
41
+ "from PIL import Image\n",
42
+ "from io import BytesIO\n",
43
+ "import base64\n",
44
+ "from openai import OpenAI\n",
45
+ "from langchain_community.llms import VLLMOpenAI\n",
46
+ "from langchain_openai import ChatOpenAI\n",
47
+ "from langchain_core.messages import HumanMessage, SystemMessage\n",
48
+ "from langchain_core.prompts.chat import (\n",
49
+ " ChatPromptTemplate,\n",
50
+ " HumanMessagePromptTemplate,\n",
51
+ " SystemMessagePromptTemplate,\n",
52
+ ")\n",
53
+ "\n",
54
+ "\n",
55
+ "PORT = 19400\n",
56
+ "client = OpenAI(api_key=\"YOUR_API_KEY\", base_url=f\"http://0.0.0.0:{PORT}/v1\")\n",
57
+ "model_name = client.models.list().data[0].id\n",
58
+ "\n",
59
+ "inference_server_url = f\"http://0.0.0.0:{PORT}/v1\"\n",
60
+ "\n",
61
+ "llm = ChatOpenAI(\n",
62
+ " model=model_name,\n",
63
+ " openai_api_key=\"EMPTY\",\n",
64
+ " openai_api_base=inference_server_url,\n",
65
+ " max_tokens=2000,\n",
66
+ " # temperature=0.1,\n",
67
+ " # top_p=0.8,\n",
68
+ " temperature=0.05,\n",
69
+ " top_p=0.9,\n",
70
+ ")\n",
71
+ "\n",
72
+ "def make_message(pil_image):\n",
73
+ "\n",
74
+ " # INSERT THIS ...\n",
75
+ " buffered = BytesIO()\n",
76
+ " pil_image.save(buffered, format=\"JPEG\")\n",
77
+ " img_str = base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n",
78
+ " img_str = str(img_str)\n",
79
+ " message = HumanMessage(\n",
80
+ " content=[\n",
81
+ " {\"type\": \"text\", \"text\": \"Describe the image\"},\n",
82
+ " {\"type\": \"image_url\", \"image_url\": {\"url\": 'data:image/jpeg;base64,' + img_str}},\n",
83
+ " ],\n",
84
+ " )\n",
85
+ " return message\n",
86
+ "# response = llm.invoke([message], temperature=0.1, top_p=0.9)\n",
87
+ "# response\n",
88
+ "def get_answer(chain, message):\n",
89
+ " response = chain.invoke([message], temperature=0.1, top_p=0.9)\n",
90
+ " return response.content\n"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 14,
96
+ "metadata": {},
97
+ "outputs": [
98
+ {
99
+ "data": {
100
+ "text/plain": [
101
+ "'OpenGVLab/InternVL2_5-8B-AWQ'"
102
+ ]
103
+ },
104
+ "execution_count": 14,
105
+ "metadata": {},
106
+ "output_type": "execute_result"
107
+ }
108
+ ],
109
+ "source": [
110
+ "model_name"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 15,
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "example_image = dataset[1000]['image']\n",
120
+ "message = make_message(example_image)\n",
121
+ "response = get_answer(message)"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "markdown",
126
+ "metadata": {},
127
+ "source": []
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 16,
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "data": {
136
+ "text/plain": [
137
+ "'The image shows a person from behind walking on a tiled floor. The person is wearing a dark shirt and dark pants. The lighting is dim, and there is a bright screen or display in the background. The person appears to be holding something in their right hand.'"
138
+ ]
139
+ },
140
+ "execution_count": 16,
141
+ "metadata": {},
142
+ "output_type": "execute_result"
143
+ }
144
+ ],
145
+ "source": [
146
+ "response"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 24,
152
+ "metadata": {},
153
+ "outputs": [
154
+ {
155
+ "data": {
156
+ "text/plain": [
157
+ "[SystemMessage(content='You are a helpful assistant who is helping user to caption about the image related to person, taking from surveillance camera. Please provide the caption in detail.'),\n",
158
+ " HumanMessage(content='Describe the image')]"
159
+ ]
160
+ },
161
+ "execution_count": 24,
162
+ "metadata": {},
163
+ "output_type": "execute_result"
164
+ }
165
+ ],
166
+ "source": [
167
+ "init_prompt = ChatPromptTemplate(\n",
168
+ " [\n",
169
+ " (\n",
170
+ " \"system\",\n",
171
+ " \"You are a helpful assistant who is helping the user write a clear prompt for guiding a Multimodal Large Language Model (MLLM) to describe the image.\",\n",
172
+ " ),\n",
173
+ " (\n",
174
+ " \"user\",\n",
175
+ " \"\"\"I want the MLLM to provide a detailed, fine-grained description of the image related to a person, taken from surveillance. The model must cover these aspects:\n",
176
+ " - The gender, pose, appearance, and age of the person in the image.\n",
177
+ " - The region of the head, face, and items such as hats, glasses, helmets, etc.\n",
178
+ " - Characteristics of the upper body, such as a red shirt, blue and white jacket, etc.\n",
179
+ " - Characteristics of the lower body, such as black jeans, white skirt, etc.\n",
180
+ " - Characteristics of accessories the person is holding, such as a phone, bag, etc.\n",
181
+ " - Characteristics of the bottom of the person, such as shoes, sandals, etc.\n",
182
+ " - The location of the person and objects in the image, such as in the park, on the street, in the house, etc.\n",
183
+ " - The transportation in the image, such as a car, bike, bus, etc.\n",
184
+ " - The time of day or lighting conditions.\n",
185
+ " - The weather conditions, such as sunny, rainy, etc.\n",
186
+ " - Any notable actions or activities the person is engaged in.\n",
187
+ " \n",
188
+ " For the objects that occur in the image or on the person, please provide a detailed description of the object, such as the color, shape, size, and any other relevant details.\n",
189
+ " Please generate three example templates to help the model describe the image in detail. For example:\n",
190
+ " EX1: \"The [gender] [age] person is wearing a [color] [type of clothing] and holding a [object] in the [location]. [He/She] is standing next to a [object] and [object]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\n",
191
+ " EX2: \"The [gender] [age] person is [action] while wearing a [color] [type of clothing]. [He/She] is holding a [object] and is located in the [location]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\n",
192
+ " EX3: \"In the [location], the [gender] [age] person is seen wearing a [color] [type of clothing] and holding a [object]. [He/She] is next to a [object] and [object]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\n",
193
+ " \"\"\"\n",
194
+ " ),\n",
195
+ " (\n",
196
+ " \"user\",\n",
197
+ " [\n",
198
+ " {\n",
199
+ " \"type\": \"image_url\",\n",
200
+ " \"image_url\": {\"url\": \"data:image/jpeg;base64,{image_data}\"},\n",
201
+ " }\n",
202
+ " ],\n",
203
+ " )\n",
204
+ " ]\n",
205
+ ")\n",
206
+ "\n",
207
+ "\n",
208
+ "\n",
209
+ "\n",
210
+ "extract_prompt = ChatPromptTemplate(\n",
211
+ " [\n",
212
+ " (\n",
213
+ " \"system\",\n",
214
+ " \"You are a helpful assistant who is helping user to caption about the image related to person, taking from surveillance camera. Please provide the caption in detail.\",\n",
215
+ " ),\n",
216
+ " (\n",
217
+ " \"user\",\n",
218
+ " \"{guild}\"\n",
219
+ " ,\n",
220
+ " ),\n",
221
+ " ]\n",
222
+ ")\n",
223
+ "\n",
224
+ "extract_prompt.format_messages(guild=\"Describe the image\")\n"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
+ "chain = init_prompt + llm \n",
234
+ "\n",
235
+ "response = chain.invoke()"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": 25,
241
+ "metadata": {},
242
+ "outputs": [
243
+ {
244
+ "data": {
245
+ "text/plain": [
246
+ "[SystemMessage(content='You are a helpful assistant who is helping the user write a clear prompt for guiding a Multimodal Large Language Model (MLLM) to describe the image.'),\n",
247
+ " HumanMessage(content='I want the MLLM to provide a detailed, fine-grained description of the image related to a person, taken from surveillance. The model must cover these aspects:\\n - The gender, pose, appearance, and age of the person in the image.\\n - The region of the head, face, and items such as hats, glasses, helmets, etc.\\n - Characteristics of the upper body, such as a red shirt, blue and white jacket, etc.\\n - Characteristics of the lower body, such as black jeans, white skirt, etc.\\n - Characteristics of accessories the person is holding, such as a phone, bag, etc.\\n - Characteristics of the bottom of the person, such as shoes, sandals, etc.\\n - The location of the person and objects in the image, such as in the park, on the street, in the house, etc.\\n - The transportation in the image, such as a car, bike, bus, etc.\\n - The time of day or lighting conditions.\\n - The weather conditions, such as sunny, rainy, etc.\\n - Any notable actions or activities the person is engaged in.\\n \\n For the objects that occur in the image or on the person, please provide a detailed description of the object, such as the color, shape, size, and any other relevant details.\\n Please generate three example templates to help the model describe the image in detail. For example:\\n EX1: \"The [gender] [age] person is wearing a [color] [type of clothing] and holding a [object] in the [location]. [He/She] is standing next to a [object] and [object]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\\n EX2: \"The [gender] [age] person is [action] while wearing a [color] [type of clothing]. [He/She] is holding a [object] and is located in the [location]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\\n EX3: \"In the [location], the [gender] [age] person is seen wearing a [color] [type of clothing] and holding a [object]. [He/She] is next to a [object] and [object]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\\n '),\n",
248
+ " HumanMessage(content=[{'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCADwAFgDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDxIRnNPKGri2zE8jFSfZ8dawvoIzTGaXyzV9ocVCVIPIpILFKQlBjvUJdj1JNLLnzGz60ytkgHpM6EYP51o20wmBDgBuxrMq9psZknIAyqjLUpLQqKLFwpMQB7Gn2Qy2T2oKSGXyn/AJ16DeeA9L0zwiNRfWE+3GBZVgGCDk9K5KtaMLJ9TqhSurnK3lzFdtGRDsZEC5B60VSjy0645HrRVWQ9tCyY8N0pjJ0q9LAUySMH3qnJj1q2cEJ3IWQVEyKQae3XrTCpPc800aXRh3IxO3FRGr2oR4IcHrwRVEiuiJIsal3Cjua6XT9BmktYpGdI0fksW5C+uKwbTHnLkgEnAJ6V7P8ADrwZpfiS9kttXXz1t7RGXypSMMWOeRWVRttJG9NK12efTaJcxTFYWSZB0bcBmp3tdQlCLIkSIg6B8/jX0MfhF4R24FncKPa4b/GkHwh8IgcWlx/4ENUuinqy1XtsfPkNi4bkxg+xor32X4M+EpRjyrxP924NFV7PsT7RM8T8RG9+1XDXob7T5hEgbqDn2rmmLE8muq14/vZFbczFiSTySa5i6KwqzcgnsRUR1PCoYn2yuiJZoYZBI+SB2FQXerPcWqW6RhEj6HuaqOxYnJqI9K6Ixsd8bkZBPWkwKcaSrSLuJiuh8N+LNV8L3X2nS5/LkPUHlWHcEVz9AosmNSaPofwz8drecxwa9ZeWWIH2iA5H1KmvYbO7gv7SK7tpFkglUMjr0Ir4dimKEV9B/AzxZFcaXcaBcSYmt8zQgn7yHrj6cfnSaLWp7FNKsUTOTwoyaK47V9Zhae5tFyrPwzeoxRXO6h6FLBSlG7PBfELzGVnEbHLcYFczqpKukR6hcn6mu+mv7e0t5ZplVwikqCO9ea3dybid5GGNxJxmpoq+qPAo4eNJWiisxxTc0E03NdZ0oWkNANGaZQhFJ0oJppNAC5Oa2PDuv3fh7WbXUbRsSQPuI/vDuKxRT8kdDSZcXZn0tFqkGspBcxWwU3OJd2/PXnp2oriPhPqi3SjS5dzS2+ZFYH+Dj+RP60VyuLue9GceVWOH8Q3F26KCCsIOScVzrMSa9CimguU3owY453Y6VyWuwWyajItsqqMfNt6ZopNRVkeJKhJPVGMWycUlW4NOkntLm6U4S3ALcHucVUNdKdzJqwUUg6UpOKoBDSUUUDEpRz0ppBFX9KskutRtYpnKRSyojEdQCQM/rUt2GkbPgbX5fDvi2xvwQYlcJMB3jbhv5/pRX0toHwr8J6CEeHTlupl5Et0fMIPsDwKKhs2VVpWPmiKCUwSSpzHGQrMDWZMx3NXremaRYReGr25lt48MzOqEbsAZ7dq8puADK+Om49frXJTkdbxCqx2Og0WC3b4e64zxDzndcSdxg5riFzxXqGh2duPhjrVzPESY1YxsGxhu3868vXj610wdzhqQaeo7tSHpS0h6VsjG1hKKKKZQh5rS0mY299bTg5MciuB64OazsVasgfOXPTNZVNjWnue+P8bNVGRDoNuRnAZ5zz+Q4orxy4Sa4l/0YMqY+7uJx+dFY3ZTjG56lJOU8NzQJ90KUwfqRXkc64nkx616A2h37SyGctNEzFjG8+ee2OK5SwsVu/EMFkwwJLhYyPb0rlU9Ga0oWsdp4pjGk/CWG2C7WuWjUj1Od39K8dP3jkYOelev/F8lNE0yKP7izHC+mFxXkABPXrXVhneNxYrR2A0UdKK6zjENFFFACir9upx8vWqFaVodqk46rxWVQ0hubmiqGndZHUDbkE8UVmWc6iPMn3kJx70VjdG1z22SxnTcxAwOTXkTyNF4hZlOCLgEfmK9t1gyRaTdsh+cQvj67TXz/NM0kpkz8+ST9c152HbmmejOmoNHe/EqWeTwtYvI+4NOvGP9mvLB6mvWvFenXN54B01nuBuVldgV7EfWvOv7GduRPHj1r0MK1y2OHF6yMrHNIa1v7Gx1uox+FJ/ZEQ+9djH0rsucVjJFLWodNtuguz+Qpy6NFjIu8/RalsLGQc9a7rw34PfW9IjvheLCGJXaV9DisGLRUZtu6ST02ivSfA9rJBoLQlHVVmfbvGCRxXNXnZaHTQhdmbH8ObdOH1JyCeQFortvKOTRXE6kjs9mi7cXcUsbxupKMCpA9DXA614JtJY5JNM3pKQSI3bIY+ldS8jYpEckisaa5XobylzLUjug6+EpLORVWSOzZWyM4IBrwVriYtzK/wCdfQzBXd0cZVsgj1FfP2pIItRuIlxhJXUY/wB4134d7nDiI9SuZXJ5Zj9TSFieppKCK6zjsGaesjjgOwHsaZRkjmkwSPV/hXKqaXf7yxZphz9F/wDr13rXMQzwx+grifhfZyTeH55ET5fOOWz3GP8A61d6tlIB1FcVRNs9KilyoqG4XsjUVbNkcctRWXIzaxnSQEA0xY8YNXH6GosVmkCehHjBFeC69bva69fQyfeWZs/nmvfQMsBXg/ie4+1eJdQm27d07cZzjHH9K68PucuI2MuugNhDL4GhvVX9/FdOrn/ZI4/Wue71NHbTSW81wiZihx5jZ6ZOB+tdhyX0IaKKVSA2WGQBSRKPb/hFbsvhOZyRhrliB+Arv2jwK5b4Y2n2TwPZ5BzMzSHI9Tjj8q69sYrCW56VNe
6imykUVI45NFSaGA8oAqEy5HXFQtKo71F5y461y2JjItNKY43kHJVSePpXz5M264kPqx/nXuF/Oh0y6BPBifI/A14WfvH6110DlxDA1raXIn9j6zCx5eGNlHuJB/jWTV7TuY9QX/p1J/JlrpOdMpUZOCB/FxSVPaQNdX1vboMvJKqAfU0CW59NeGrVrXwzpkDH547dR6dulaRYbR1xjg+/pUUOIrWFDlAqKj+qsBjIpWlO5txGQfmAP/j1YTR6NPYazcUVXlnVVO7r/P3oqLGhxhfIppYAVWLECk3n1rBHIpMr6+C/hzUNjYYQsePavIcc169qAM2mXUQ6vEw/SvIcYJHpXVS2Mql2wHFWrG4+zSysTw8Ekf8A30pFVqK2MkgrW8L2aah4o021dmVXnXJHtzWR3Fb3g60a98WadGjlGEobcO2Kb2KjufR/QsJcMV+ViOBjsaikVDGDkFl5Ge4pnmlU3Mh8xD5cik/eHY/kaGyF8pSPl5ic9x6VizvhsUrkBwpHRuUPofSiknYYZmGEYcj+6fWikWcSZc03zKrF6YZDXOcKLhYSBkJ+8Cv5ivJrqMwXcsRGCjkc/WvTN7dQeRXC+Jrcwaw8hOVmG8H+db0hS2MinpE8iSMo+WNdzH05xTK2La3hTwvcXbN++kkEYGOw5roMzG716F8JrVJtdvZTjfFANh9CT1rz016N8LrJ1e/1Vc7oAqqoH3h1NTLYuG566ylpVn9fkk/lSv8AKvk7gGGTGfWmG4TdC4OYboY3ehxVeeUspjwfNhO4c8kVkdqWglzKHG88AjEi+/rRVG7vUYLOobZJ8rADoaKm4XOI4703IVgaUjmmMKyOUkfiU8Vj+ItNi1C3Q+ZtmXhBjO72rZlZSykegrF1slo1wehrSDsxSVzjRbEZB7GmyM6x+VuPlg7tvbNX5MkkBCfpVZreSRvulfrW6kRylTGRXq3wnISwuhIF2TSlc+4HSvL/ACJAcYyfQV7T4G0pbLwuqfKZ2fznKnoccCiT0HBanSWgDJNpkn3ozuiJ7jtTXclUuwDviJDqO9F/JstrfUYVYyQHLgfxL3FR3Vwsdwk68wXA+cdgexrM676Fe5McUhA5imGR/smiq0+3zXtJfutyh9P/AK9FQK5yo4pDjFQPdIuT1qlLes/AXA9qzOd7k9zdeWVAHese7kMpbPep7hHdVIJzmnxWO/Bc4FWkMz7KFvOzUt6Q8y7h90YrSZYIDhetY935skzbFyatMkiih/06OTblEO5h7CvSvDmqy72nTKwykoQxzivOtMt72a62sQqdwRya7vSImj0OWLH72M8j3FNscdzp4byWO6e1bJjlBIyenrVO3kOLjTZm3GPJQ+x5GPpVe4u2msYryPh0IOAO3Qim38oja21CLkZCPkfwn/Cg15hQ0k9v8x/fxHAPvRTLl1inE6fdbqR6UVFxn//Z'}}])]"
249
+ ]
250
+ },
251
+ "execution_count": 25,
252
+ "metadata": {},
253
+ "output_type": "execute_result"
254
+ }
255
+ ],
256
+ "source": [
257
+ "def get_str_img(pil_image):\n",
258
+ " buffered = BytesIO()\n",
259
+ " pil_image.save(buffered, format=\"JPEG\")\n",
260
+ " img_str = base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n",
261
+ " img_str = str(img_str)\n",
262
+ " return img_str\n",
263
+ "\n",
264
+ "\n",
265
+ "\n",
266
+ "init_prompt.format_messages(\n",
267
+ " image_data=get_str_img(example_image)\n",
268
+ ")"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 29,
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "init_chain = init_prompt | llm\n",
278
+ "response = init_chain.invoke(input={\"image_data\": get_str_img(example_image)})"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 30,
284
+ "metadata": {},
285
+ "outputs": [
286
+ {
287
+ "data": {
288
+ "text/plain": [
289
+ "AIMessage(content='To guide a Multimodal Large Language Model (MLLM) to provide a detailed description of the image, you can use the following templates. These templates are designed to cover all the aspects you mentioned, ensuring a comprehensive and clear description:\\n\\n### Template 1:\\n\"The [gender] [age] person is seen from behind, wearing a [color] [type of clothing] and [type of pants]. [He/She] is walking on a [surface] and appears to be in a [location]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\\n\\n### Template 2:\\n\"The [gender] [age] person is walking while wearing a [color] [type of clothing] and [type of pants]. [He/She] is holding a [object] and is located in the [location]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\\n\\n### Template 3:\\n\"In the [location], the [gender] [age] person is seen from behind, wearing a [color] [type of clothing] and [type of pants]. [He/She] is walking on a [surface] and appears to be in a [location]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\\n\\nThese templates provide a structured format for the MLLM to describe the image, ensuring that all relevant details are covered.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 350, 'prompt_tokens': 1649, 'total_tokens': 1999, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'OpenGVLab/InternVL2_5-8B-AWQ', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-eb1871d9-302e-45a6-a6c5-5b2f425f7c3b-0', usage_metadata={'input_tokens': 1649, 'output_tokens': 350, 'total_tokens': 1999})"
290
+ ]
291
+ },
292
+ "execution_count": 30,
293
+ "metadata": {},
294
+ "output_type": "execute_result"
295
+ }
296
+ ],
297
+ "source": [
298
+ "response"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": null,
304
+ "metadata": {},
305
+ "outputs": [],
306
+ "source": []
307
+ }
308
+ ],
309
+ "metadata": {
310
+ "kernelspec": {
311
+ "display_name": "Python 3 (ipykernel)",
312
+ "language": "python",
313
+ "name": "python3"
314
+ },
315
+ "language_info": {
316
+ "codemirror_mode": {
317
+ "name": "ipython",
318
+ "version": 3
319
+ },
320
+ "file_extension": ".py",
321
+ "mimetype": "text/x-python",
322
+ "name": "python",
323
+ "nbconvert_exporter": "python",
324
+ "pygments_lexer": "ipython3",
325
+ "version": "3.12.2"
326
+ }
327
+ },
328
+ "nbformat": 4,
329
+ "nbformat_minor": 4
330
+ }
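The notebook above routes a base64 data-URL image through LangChain's ChatOpenAI wrapper to the vLLM-served model. The same request can be made with the bare openai client, which is sometimes easier to debug; a minimal sketch, assuming the same local vLLM endpoint and any local PIL image:

import base64
from io import BytesIO

from openai import OpenAI
from PIL import Image

client = OpenAI(api_key="EMPTY", base_url="http://0.0.0.0:19400/v1")
model_name = client.models.list().data[0].id

def pil_to_data_url(pil_image: Image.Image) -> str:
    # serialize the image to an in-memory JPEG and wrap it as a base64 data URL
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    return "data:image/jpeg;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")

def caption(pil_image: Image.Image) -> str:
    # single chat-completion call with a text part and an image_url part
    response = client.chat.completions.create(
        model=model_name,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe the person in the image in detail."},
                {"type": "image_url", "image_url": {"url": pil_to_data_url(pil_image)}},
            ],
        }],
        max_tokens=512,
        temperature=0.1,
    )
    return response.choices[0].message.content

# print(caption(Image.open("example_image.jpg")))  # any local image file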
a_temp/vllm_example.sh ADDED
@@ -0,0 +1,412 @@
1
+ # usage: vllm serve <model_tag> [options]
2
+
3
+ # positional arguments:
4
+ # model_tag The model tag to serve
5
+
6
+ # options:
7
+ # --allow-credentials allow credentials
8
+ # --allowed-headers ALLOWED_HEADERS
9
+ # allowed headers
10
+ # --allowed-local-media-path ALLOWED_LOCAL_MEDIA_PATH
11
+ # Allowing API requests to read local images or videos from directories
12
+ # specified by the server file system. This is a security risk. Should only be
13
+ # enabled in trusted environments.
14
+ # --allowed-methods ALLOWED_METHODS
15
+ # allowed methods
16
+ # --allowed-origins ALLOWED_ORIGINS
17
+ # allowed origins
18
+ # --api-key API_KEY If provided, the server will require this key to be presented in the header.
19
+ # --block-size {8,16,32,64,128}
20
+ # Token block size for contiguous chunks of tokens. This is ignored on neuron
21
+ # devices and set to max-model-len. On CUDA devices, only block sizes up to 32
22
+ # are supported. On HPU devices, block size defaults to 128.
23
+ # --chat-template CHAT_TEMPLATE
24
+ # The file path to the chat template, or the template in single-line form for
25
+ # the specified model
26
+ # --chat-template-content-format {auto,string,openai}
27
+ # The format to render message content within a chat template. * "string" will
28
+ # render the content as a string. Example: "Hello World" * "openai" will render
29
+ # the content as a list of dictionaries, similar to OpenAI schema. Example:
30
+ # [{"type": "text", "text": "Hello world!"}]
31
+ # --code-revision CODE_REVISION
32
+ # The specific revision to use for the model code on Hugging Face Hub. It can
33
+ # be a branch name, a tag name, or a commit id. If unspecified, will use the
34
+ # default version.
35
+ # --collect-detailed-traces COLLECT_DETAILED_TRACES
36
+ # Valid choices are model,worker,all. It makes sense to set this only if
37
+ # --otlp-traces-endpoint is set. If set, it will collect detailed traces for
38
+ # the specified modules. This involves use of possibly costly and or blocking
39
+ # operations and hence might have a performance impact.
40
+ # --compilation-config COMPILATION_CONFIG, -O COMPILATION_CONFIG
41
+ # torch.compile configuration for the model.When it is a number (0, 1, 2, 3),
42
+ # it will be interpreted as the optimization level. NOTE: level 0 is the
43
+ # default level without any optimization. level 1 and 2 are for internal
44
+ # testing only. level 3 is the recommended level for production. To specify the
45
+ # full compilation config, use a JSON string. Following the convention of
46
+ # traditional compilers, using -O without space is also supported. -O3 is
47
+ # equivalent to -O 3.
48
+ # --config CONFIG Read CLI options from a config file.Must be a YAML with the following options
49
+ # :https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-
50
+ # reference
51
+ # --config-format {auto,hf,mistral}
52
+ # The format of the model config to load. * "auto" will try to load the config
53
+ # in hf format if available else it will try to load in mistral format
54
+ # --cpu-offload-gb CPU_OFFLOAD_GB
55
+ # The space in GiB to offload to CPU, per GPU. Default is 0, which means no
56
+ # offloading. Intuitively, this argument can be seen as a virtual way to
57
+ # increase the GPU memory size. For example, if you have one 24 GB GPU and set
58
+ # this to 10, virtually you can think of it as a 34 GB GPU. Then you can load a
59
+ # 13B model with BF16 weight, which requires at least 26GB GPU memory. Note
60
+ # that this requires fast CPU-GPU interconnect, as part of the model is loaded
61
+ # from CPU memory to GPU memory on the fly in each model forward pass.
62
+ # --device {auto,cuda,neuron,cpu,openvino,tpu,xpu,hpu}
63
+ # Device type for vLLM execution.
64
+ # --disable-async-output-proc
65
+ # Disable async output processing. This may result in lower performance.
66
+ # --disable-custom-all-reduce
67
+ # See ParallelConfig.
68
+ # --disable-fastapi-docs
69
+ # Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint
70
+ # --disable-frontend-multiprocessing
71
+ # If specified, will run the OpenAI frontend server in the same process as the
72
+ # model serving engine.
73
+ # --disable-log-requests
74
+ # Disable logging requests.
75
+ # --disable-log-stats Disable logging statistics.
76
+ # --disable-logprobs-during-spec-decoding [DISABLE_LOGPROBS_DURING_SPEC_DECODING]
77
+ # If set to True, token log probabilities are not returned during speculative
78
+ # decoding. If set to False, log probabilities are returned according to the
79
+ # settings in SamplingParams. If not specified, it defaults to True. Disabling
80
+ # log probabilities during speculative decoding reduces latency by skipping
81
+ # logprob calculation in proposal sampling, target sampling, and after accepted
82
+ # tokens are determined.
83
+ # --disable-mm-preprocessor-cache
84
+ # If true, then disables caching of the multi-modal preprocessor/mapper. (not
85
+ # recommended)
86
+ # --disable-sliding-window
87
+ # Disables sliding window, capping to sliding window size
88
+ # --distributed-executor-backend {ray,mp}
89
+ # Backend to use for distributed model workers, either "ray" or "mp"
90
+ # (multiprocessing). If the product of pipeline_parallel_size and
91
+ # tensor_parallel_size is less than or equal to the number of GPUs available,
92
+ # "mp" will be used to keep processing on a single host. Otherwise, this will
93
+ # default to "ray" if Ray is installed and fail otherwise. Note that tpu and
94
+ # hpu only support Ray for distributed inference.
95
+ # --download-dir DOWNLOAD_DIR
96
+ # Directory to download and load the weights, default to the default cache dir
97
+ # of huggingface.
98
+ # --dtype {auto,half,float16,bfloat16,float,float32}
99
+ # Data type for model weights and activations. * "auto" will use FP16 precision
100
+ # for FP32 and FP16 models, and BF16 precision for BF16 models. * "half" for
101
+ # FP16. Recommended for AWQ quantization. * "float16" is the same as "half". *
102
+ # "bfloat16" for a balance between precision and range. * "float" is shorthand
103
+ # for FP32 precision. * "float32" for FP32 precision.
104
+ # --enable-auto-tool-choice
105
+ # Enable auto tool choice for supported models. Use --tool-call-parser to
106
+ # specify which parser to use
107
+ # --enable-chunked-prefill [ENABLE_CHUNKED_PREFILL]
108
+ # If set, the prefill requests can be chunked based on the
109
+ # max_num_batched_tokens.
110
+ # --enable-lora If True, enable handling of LoRA adapters.
111
+ # --enable-lora-bias If True, enable bias for LoRA adapters.
112
+ # --enable-prefix-caching, --no-enable-prefix-caching
113
+ # Enables automatic prefix caching. Use --no-enable-prefix-caching to disable
114
+ # explicitly.
115
+ # --enable-prompt-adapter
116
+ # If True, enable handling of PromptAdapters.
117
+ # --enable-prompt-tokens-details
118
+ # If set to True, enable prompt_tokens_details in usage.
119
+ # --enable-request-id-headers
120
+ # If specified, API server will add X-Request-Id header to responses. Caution:
121
+ # this hurts performance at high QPS.
122
+ # --enforce-eager Always use eager-mode PyTorch. If False, will use eager mode and CUDA graph
123
+ # in hybrid for maximal performance and flexibility.
124
+ # --fully-sharded-loras
125
+ # By default, only half of the LoRA computation is sharded with tensor
126
+ # parallelism. Enabling this will use the fully sharded layers. At high
127
+ # sequence length, max rank or tensor parallel size, this is likely faster.
128
+ # --generation-config GENERATION_CONFIG
129
+ # The folder path to the generation config. Defaults to None, will use the
130
+ # default generation config in vLLM. If set to 'auto', the generation config
131
+ # will be automatically loaded from model. If set to a folder path, the
132
+ # generation config will be loaded from the specified folder path.
133
+ # --gpu-memory-utilization GPU_MEMORY_UTILIZATION
134
+ # The fraction of GPU memory to be used for the model executor, which can range
135
+ # from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
136
+ # utilization. If unspecified, will use the default value of 0.9. This is a
137
+ # per-instance limit, and only applies to the current vLLM instance.It does not
138
+ # matter if you have another vLLM instance running on the same GPU. For
139
+ # example, if you have two vLLM instances running on the same GPU, you can set
140
+ # the GPU memory utilization to 0.5 for each instance.
141
+ # --guided-decoding-backend {outlines,lm-format-enforcer,xgrammar}
142
+ # Which engine will be used for guided decoding (JSON schema / regex etc) by
143
+ # default. Currently support https://github.com/outlines-dev/outlines,
144
+ # https://github.com/mlc-ai/xgrammar, and https://github.com/noamgat/lm-format-
145
+ # enforcer. Can be overridden per request via guided_decoding_backend
146
+ # parameter.
147
+ # --hf-overrides HF_OVERRIDES
148
+ # Extra arguments for the HuggingFace config. This should be a JSON string that
149
+ # will be parsed into a dictionary.
150
+ # --host HOST host name
151
+ # --ignore-patterns IGNORE_PATTERNS
152
+ # The pattern(s) to ignore when loading the model.Default to `original/**/*` to
153
+ # avoid repeated loading of llama's checkpoints.
154
+ # --kv-cache-dtype {auto,fp8,fp8_e5m2,fp8_e4m3}
155
+ # Data type for kv cache storage. If "auto", will use model data type. CUDA
156
+ # 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports fp8
157
+ # (=fp8_e4m3)
158
+ # --kv-transfer-config KV_TRANSFER_CONFIG
159
+ # The configurations for distributed KV cache transfer. Should be a JSON
160
+ # string.
161
+ # --limit-mm-per-prompt LIMIT_MM_PER_PROMPT
162
+ # For each multimodal plugin, limit how many input instances to allow for each
163
+ # prompt. Expects a comma-separated list of items, e.g.: `image=16,video=2`
164
+ # allows a maximum of 16 images and 2 videos per prompt. Defaults to 1 for each
165
+ # modality.
166
+ # --load-format {auto,pt,safetensors,npcache,dummy,tensorizer,sharded_state,gguf,bitsandbytes,mistral,runai_streamer}
167
+ # The format of the model weights to load. * "auto" will try to load the
168
+ # weights in the safetensors format and fall back to the pytorch bin format if
169
+ # safetensors format is not available. * "pt" will load the weights in the
170
+ # pytorch bin format. * "safetensors" will load the weights in the safetensors
171
+ # format. * "npcache" will load the weights in pytorch format and store a numpy
172
+ # cache to speed up the loading. * "dummy" will initialize the weights with
173
+ # random values, which is mainly for profiling. * "tensorizer" will load the
174
+ # weights using tensorizer from CoreWeave. See the Tensorize vLLM Model script
175
+ # in the Examples section for more information. * "runai_streamer" will load
176
+ # the Safetensors weights using Run:aiModel Streamer * "bitsandbytes" will load
177
+ # the weights using bitsandbytes quantization.
178
+ # --logits-processor-pattern LOGITS_PROCESSOR_PATTERN
179
+ # Optional regex pattern specifying valid logits processor qualified names that
180
+ # can be passed with the `logits_processors` extra completion argument.
181
+ # Defaults to None, which allows no processors.
182
+ # --long-lora-scaling-factors LONG_LORA_SCALING_FACTORS
183
+ # Specify multiple scaling factors (which can be different from base model
184
+ # scaling factor - see eg. Long LoRA) to allow for multiple LoRA adapters
185
+ # trained with those scaling factors to be used at the same time. If not
186
+ # specified, only adapters trained with the base model scaling factor are
187
+ # allowed.
188
+ # --lora-dtype {auto,float16,bfloat16}
189
+ # Data type for LoRA. If auto, will default to base model dtype.
190
+ # --lora-extra-vocab-size LORA_EXTRA_VOCAB_SIZE
191
+ # Maximum size of extra vocabulary that can be present in a LoRA adapter (added
192
+ # to the base model vocabulary).
193
+ # --lora-modules LORA_MODULES [LORA_MODULES ...]
194
+ # LoRA module configurations in either 'name=path' formator JSON format.
195
+ # Example (old format): 'name=path' Example (new format): '{"name": "name",
196
+ # "local_path": "path", "base_model_name": "id"}'
197
+ # --max-cpu-loras MAX_CPU_LORAS
198
+ # Maximum number of LoRAs to store in CPU memory. Must be >= than max_loras.
199
+ # Defaults to max_loras.
200
+ # --max-log-len MAX_LOG_LEN
201
+ # Max number of prompt characters or prompt ID numbers being printed in log.
202
+ # Default: Unlimited
203
+ # --max-logprobs MAX_LOGPROBS
204
+ # Max number of log probs to return logprobs is specified in SamplingParams.
205
+ # --max-lora-rank MAX_LORA_RANK
206
+ # Max LoRA rank.
207
+ # --max-loras MAX_LORAS
208
+ # Max number of LoRAs in a single batch.
209
+ # --max-model-len MAX_MODEL_LEN
210
+ # Model context length. If unspecified, will be automatically derived from the
211
+ # model config.
212
+ # --max-num-batched-tokens MAX_NUM_BATCHED_TOKENS
213
+ # Maximum number of batched tokens per iteration.
214
+ # --max-num-seqs MAX_NUM_SEQS
215
+ # Maximum number of sequences per iteration.
216
+ # --max-parallel-loading-workers MAX_PARALLEL_LOADING_WORKERS
217
+ # Load model sequentially in multiple batches, to avoid RAM OOM when using
218
+ # tensor parallel and large models.
219
+ # --max-prompt-adapter-token MAX_PROMPT_ADAPTER_TOKEN
220
+ # Max number of PromptAdapters tokens
221
+ # --max-prompt-adapters MAX_PROMPT_ADAPTERS
222
+ # Max number of PromptAdapters in a batch.
223
+ # --max-seq-len-to-capture MAX_SEQ_LEN_TO_CAPTURE
224
+ # Maximum sequence length covered by CUDA graphs. When a sequence has context
225
+ # length larger than this, we fall back to eager mode. Additionally for
226
+ # encoder-decoder models, if the sequence length of the encoder input is larger
227
+ # than this, we fall back to the eager mode.
228
+ # --middleware MIDDLEWARE
229
+ # Additional ASGI middleware to apply to the app. We accept multiple
230
+ # --middleware arguments. The value should be an import path. If a function is
231
+ # provided, vLLM will add it to the server using @app.middleware('http'). If a
232
+ # class is provided, vLLM will add it to the server using app.add_middleware().
233
+ # --mm-processor-kwargs MM_PROCESSOR_KWARGS
234
+ # Overrides for the multimodal input mapping/processing, e.g., image processor.
235
+ # For example: {"num_crops": 4}.
236
+ # --model MODEL Name or path of the huggingface model to use.
237
+ # --model-loader-extra-config MODEL_LOADER_EXTRA_CONFIG
238
+ # Extra config for model loader. This will be passed to the model loader
239
+ # corresponding to the chosen load_format. This should be a JSON string that
240
+ # will be parsed into a dictionary.
241
+ # --multi-step-stream-outputs [MULTI_STEP_STREAM_OUTPUTS]
242
+ # If False, then multi-step will stream outputs at the end of all steps
243
+ # --ngram-prompt-lookup-max NGRAM_PROMPT_LOOKUP_MAX
244
+ # Max size of window for ngram prompt lookup in speculative decoding.
245
+ # --ngram-prompt-lookup-min NGRAM_PROMPT_LOOKUP_MIN
246
+ # Min size of window for ngram prompt lookup in speculative decoding.
247
+ # --num-gpu-blocks-override NUM_GPU_BLOCKS_OVERRIDE
248
+ # If specified, ignore GPU profiling result and use this number of GPU blocks.
249
+ # Used for testing preemption.
250
+ # --num-lookahead-slots NUM_LOOKAHEAD_SLOTS
251
+ # Experimental scheduling config necessary for speculative decoding. This will
252
+ # be replaced by speculative config in the future; it is present to enable
253
+ # correctness tests until then.
254
+ # --num-scheduler-steps NUM_SCHEDULER_STEPS
255
+ # Maximum number of forward steps per scheduler call.
256
+ # --num-speculative-tokens NUM_SPECULATIVE_TOKENS
257
+ # The number of speculative tokens to sample from the draft model in
258
+ # speculative decoding.
259
+ # --otlp-traces-endpoint OTLP_TRACES_ENDPOINT
260
+ # Target URL to which OpenTelemetry traces will be sent.
261
+ # --override-neuron-config OVERRIDE_NEURON_CONFIG
262
+ # Override or set neuron device configuration. e.g. {"cast_logits_dtype":
263
+ # "bloat16"}.'
264
+ # --override-pooler-config OVERRIDE_POOLER_CONFIG
265
+ # Override or set the pooling method for pooling models. e.g. {"pooling_type":
266
+ # "mean", "normalize": false}.'
267
+ # --pipeline-parallel-size PIPELINE_PARALLEL_SIZE, -pp PIPELINE_PARALLEL_SIZE
268
+ # Number of pipeline stages.
269
+ # --port PORT port number
270
+ # --preemption-mode PREEMPTION_MODE
271
+ # If 'recompute', the engine performs preemption by recomputing; If 'swap', the
272
+ # engine performs preemption by block swapping.
273
+ # --prompt-adapters PROMPT_ADAPTERS [PROMPT_ADAPTERS ...]
274
+ # Prompt adapter configurations in the format name=path. Multiple adapters can
275
+ # be specified.
276
+ # --qlora-adapter-name-or-path QLORA_ADAPTER_NAME_OR_PATH
277
+ # Name or path of the QLoRA adapter.
278
+ # --quantization {aqlm,awq,deepspeedfp,tpu_int8,fp8,fbgemm_fp8,modelopt,marlin,gguf,gptq_marlin_24,gptq_marlin,awq_marlin,gptq,compressed-tensors,bitsandbytes,qqq,hqq,experts_int8,neuron_quant,ipex,None}, -q {aqlm,awq,deepspeedfp,tpu_int8,fp8,fbgemm_fp8,modelopt,marlin,gguf,gptq_marlin_24,gptq_marlin,awq_marlin,gptq,compressed-tensors,bitsandbytes,qqq,hqq,experts_int8,neuron_quant,ipex,None}
279
+ # Method used to quantize the weights. If None, we first check the
280
+ # `quantization_config` attribute in the model config file. If that is None, we
281
+ # assume the model weights are not quantized and use `dtype` to determine the
282
+ # data type of the weights.
283
+ # --quantization-param-path QUANTIZATION_PARAM_PATH
284
+ # Path to the JSON file containing the KV cache scaling factors. This should
285
+ # generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache
286
+ # scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2
287
+ # (without scaling) is only supported on cuda version greater than 11.8. On
288
+ # ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.
289
+ # --ray-workers-use-nsight
290
+ # If specified, use nsight to profile Ray workers.
291
+ # --response-role RESPONSE_ROLE
292
+ # The role name to return if `request.add_generation_prompt=true`.
293
+ # --return-tokens-as-token-ids
294
+ # When --max-logprobs is specified, represents single tokens as strings of the
295
+ # form 'token_id:{token_id}' so that tokens that are not JSON-encodable can be
296
+ # identified.
297
+ # --revision REVISION The specific model version to use. It can be a branch name, a tag name, or a
298
+ # commit id. If unspecified, will use the default version.
299
+ # --root-path ROOT_PATH
300
+ # FastAPI root_path when app is behind a path based routing proxy
301
+ # --rope-scaling ROPE_SCALING
302
+ # RoPE scaling configuration in JSON format. For example,
303
+ # {"rope_type":"dynamic","factor":2.0}
304
+ # --rope-theta ROPE_THETA
305
+ # RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE theta
306
+ # improves the performance of the scaled model.
307
+ # --scheduler-delay-factor SCHEDULER_DELAY_FACTOR
308
+ # Apply a delay (of delay factor multiplied by previous prompt latency) before
309
+ # scheduling next prompt.
310
+ # --scheduling-policy {fcfs,priority}
311
+ # The scheduling policy to use. "fcfs" (first come first served, i.e. requests
312
+ # are handled in order of arrival; default) or "priority" (requests are handled
313
+ # based on given priority (lower value means earlier handling) and time of
314
+ # arrival deciding any ties).
315
+ # --seed SEED Random seed for operations.
316
+ # --served-model-name SERVED_MODEL_NAME [SERVED_MODEL_NAME ...]
317
+ # The model name(s) used in the API. If multiple names are provided, the server
318
+ # will respond to any of the provided names. The model name in the model field
319
+ # of a response will be the first name in this list. If not specified, the
320
+ # model name will be the same as the `--model` argument. Noted that this
321
+ # name(s) will also be used in `model_name` tag content of prometheus metrics,
322
+ # if multiple names provided, metrics tag will take the first one.
323
+ # --skip-tokenizer-init
324
+ # Skip initialization of tokenizer and detokenizer
325
+ # --spec-decoding-acceptance-method {rejection_sampler,typical_acceptance_sampler}
326
+ # Specify the acceptance method to use during draft token verification in
327
+ # speculative decoding. Two types of acceptance routines are supported: 1)
328
+ # RejectionSampler which does not allow changing the acceptance rate of draft
329
+ # tokens, 2) TypicalAcceptanceSampler which is configurable, allowing for a
330
+ # higher acceptance rate at the cost of lower quality, and vice versa.
331
+ # --speculative-disable-by-batch-size SPECULATIVE_DISABLE_BY_BATCH_SIZE
332
+ # Disable speculative decoding for new incoming requests if the number of
333
+ # enqueue requests is larger than this value.
334
+ # --speculative-disable-mqa-scorer
335
+ # If set to True, the MQA scorer will be disabled in speculative decoding and
336
+ # the engine will fall back to batch expansion.
337
+ # --speculative-draft-tensor-parallel-size SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE, -spec-draft-tp SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE
338
+ # Number of tensor parallel replicas for the draft model in speculative
339
+ # decoding.
340
+ # --speculative-max-model-len SPECULATIVE_MAX_MODEL_LEN
341
+ # The maximum sequence length supported by the draft model. Sequences over this
342
+ # length will skip speculation.
343
+ # --speculative-model SPECULATIVE_MODEL
344
+ # The name of the draft model to be used in speculative decoding.
345
+ # --speculative-model-quantization {aqlm,awq,deepspeedfp,tpu_int8,fp8,fbgemm_fp8,modelopt,marlin,gguf,gptq_marlin_24,gptq_marlin,awq_marlin,gptq,compressed-tensors,bitsandbytes,qqq,hqq,experts_int8,neuron_quant,ipex,None}
346
+ # Method used to quantize the weights of speculative model. If None, we first
347
+ # check the `quantization_config` attribute in the model config file. If that
348
+ # is None, we assume the model weights are not quantized and use `dtype` to
349
+ # determine the data type of the weights.
350
+ # --ssl-ca-certs SSL_CA_CERTS
351
+ # The CA certificates file
352
+ # --ssl-cert-reqs SSL_CERT_REQS
353
+ # Whether client certificate is required (see stdlib ssl module's)
354
+ # --ssl-certfile SSL_CERTFILE
355
+ # The file path to the SSL cert file
356
+ # --ssl-keyfile SSL_KEYFILE
357
+ # The file path to the SSL key file
358
+ # --swap-space SWAP_SPACE
359
+ # CPU swap space size (GiB) per GPU.
360
+ # --task {auto,generate,embedding,embed,classify,score,reward}
361
+ # The task to use the model for. Each vLLM instance only supports one task,
362
+ # even if the same model can be used for multiple tasks. When the model only
363
+ # supports one task, "auto" can be used to select it; otherwise, you must
364
+ # specify explicitly which task to use.
365
+ # --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
366
+ # Number of tensor parallel replicas.
367
+ # --tokenizer TOKENIZER
368
+ # Name or path of the huggingface tokenizer to use. If unspecified, model name
369
+ # or path will be used.
370
+ # --tokenizer-mode {auto,slow,mistral}
371
+ # The tokenizer mode. * "auto" will use the fast tokenizer if available. *
372
+ # "slow" will always use the slow tokenizer. * "mistral" will always use the
373
+ # `mistral_common` tokenizer.
374
+ # --tokenizer-pool-extra-config TOKENIZER_POOL_EXTRA_CONFIG
375
+ # Extra config for tokenizer pool. This should be a JSON string that will be
376
+ # parsed into a dictionary. Ignored if tokenizer_pool_size is 0.
377
+ # --tokenizer-pool-size TOKENIZER_POOL_SIZE
378
+ # Size of tokenizer pool to use for asynchronous tokenization. If 0, will use
379
+ # synchronous tokenization.
380
+ # --tokenizer-pool-type TOKENIZER_POOL_TYPE
381
+ # Type of tokenizer pool to use for asynchronous tokenization. Ignored if
382
+ # tokenizer_pool_size is 0.
383
+ # --tokenizer-revision TOKENIZER_REVISION
384
+ # Revision of the huggingface tokenizer to use. It can be a branch name, a tag
385
+ # name, or a commit id. If unspecified, will use the default version.
386
+ # --tool-call-parser {granite-20b-fc,granite,hermes,internlm,jamba,llama3_json,mistral,pythonic} or name registered in --tool-parser-plugin
387
+ # Select the tool call parser depending on the model that you're using. This is
388
+ # used to parse the model-generated tool call into OpenAI API format. Required
389
+ # for --enable-auto-tool-choice.
390
+ # --tool-parser-plugin TOOL_PARSER_PLUGIN
391
+ # Specify the tool parser plugin used to parse model-generated tool calls into
392
+ # OpenAI API format; names registered in this plugin can be used in --tool-
393
+ # call-parser.
394
+ # --trust-remote-code Trust remote code from huggingface.
395
+ # --typical-acceptance-sampler-posterior-alpha TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA
396
+ # A scaling factor for the entropy-based threshold for token acceptance in the
397
+ # TypicalAcceptanceSampler. Typically defaults to sqrt of --typical-acceptance-
398
+ # sampler-posterior-threshold i.e. 0.3
399
+ # --typical-acceptance-sampler-posterior-threshold TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD
400
+ # Set the lower bound threshold for the posterior probability of a token to be
401
+ # accepted. This threshold is used by the TypicalAcceptanceSampler to make
402
+ # sampling decisions during speculative decoding. Defaults to 0.09
403
+ # --use-v2-block-manager
404
+ # [DEPRECATED] block manager v1 has been removed and SelfAttnBlockSpaceManager
405
+ # (i.e. block manager v2) is now the default. Setting this flag to True or
406
+ # False has no effect on vLLM behavior.
407
+ # --uvicorn-log-level {debug,info,warning,error,critical,trace}
408
+ # log level for uvicorn
409
+ # --worker-cls WORKER_CLS
410
+ # The worker class to use for distributed execution.
411
+ # --worker-use-ray Deprecated, use --distributed-executor-backend=ray.
412
+ # -h, --help show this help message and exit
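+ #
+ # Illustrative example only: one way a few of the flags documented above can be
+ # combined. The model name and values are placeholders, not recommendations.
+ # vllm serve meta-llama/Llama-3.1-8B-Instruct \
+ #     --tensor-parallel-size 2 \
+ #     --max-model-len 8192 \
+ #     --served-model-name my-llama \
+ #     --uvicorn-log-level info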
groundingLMM/train.py ADDED
@@ -0,0 +1,671 @@
1
+ """
2
+ train.py - GLaMM Model Training on Mixed Datasets
3
+
4
+ Trains the GLaMM model using Caption, Region, and Segmentation datasets with a random sampling approach. This method
5
+ is crucial for developing a versatile model capable of handling diverse applications effectively.
6
+ """
7
+ import os
8
+ import sys
9
+ import time
10
+ import tqdm
11
+ import random
12
+ import torch
13
+ import argparse
14
+ import deepspeed
15
+ import numpy as np
16
+ import transformers
17
+ from functools import partial
18
+ from torch.utils.data import ConcatDataset
19
+ from peft import LoraConfig, get_peft_model
20
+ from torch.utils.tensorboard import SummaryWriter
21
+
22
+ from model.GLaMM import GLaMMForCausalLM
23
+ from model.llava import conversation as conversation_lib
24
+
25
+ from dataset.dataset import custom_collate_fn, HybridSegDataset, HybridRegDataset, HybridCapDataset
26
+ from tools.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, AverageMeter, ProgressMeter, dict_to_cuda,
27
+ Summary, intersectionAndUnionGPU)
28
+
29
+ from dataset.segm_datasets.RefCOCO_Segm_ds import ReferSegmDataset
30
+ from dataset.region_datasets.RefCOCO_VG_Region_ds import RefCocoGRegDataset, VisualGenomeRegDataset
31
+ from dataset.caption_datasets.COCO_Caption_ds import CocoCapDataset
32
+ from dataset.gcg_datasets.GranDf_gcg_ds import OpenPsgGCGDataset, Flickr30kGCGDataset, RefCOCOgGCGDataset
33
+
34
+
35
+ def parse_args(args):
36
+ parser = argparse.ArgumentParser(description="GLaMM Model Training")
37
+
38
+ # Model-specific settings
39
+ parser.add_argument("--version", default="MBZUAI/GLaMM-GranD-Pretrained")
40
+ parser.add_argument("--vision_pretrained", default="./checkpoints/sam_vit_h_4b8939.pth", type=str)
41
+ parser.add_argument("--vision-tower", default="openai/clip-vit-large-patch14-336", type=str)
42
+ parser.add_argument("--conv_type", default="llava_v1", type=str, choices=["llava_v1", "llava_llama_2"])
43
+ parser.add_argument("--tune_mm_mlp_adapter", action="store_true")
44
+ parser.add_argument("--freeze_mm_mlp_adapter", action="store_true")
45
+ parser.add_argument("--mm_use_im_start_end", action="store_true", default=True)
46
+ parser.add_argument("--out_dim", default=256, type=int)
47
+ parser.add_argument("--image_size", default=1024, type=int, help="Image size for grounding image encoder")
48
+ parser.add_argument("--model_max_length", default=1536, type=int)
49
+ parser.add_argument("--lora_target_modules", default="q_proj,v_proj", type=str)
50
+ parser.add_argument("--with_region", action="store_true", default=True)
51
+ parser.add_argument("--mm_vision_select_layer", default=-2, type=int)
52
+ parser.add_argument("--pretrain_mm_mlp_adapter", default="", type=str)
53
+ parser.add_argument("--precision", default='bf16', type=str)
54
+
55
+ # Dataset settings
56
+ parser.add_argument("--use_cap_data", action="store_true", help="Use caption data")
57
+ parser.add_argument("--use_reg_data", action="store_true", help="Use region data")
58
+ parser.add_argument("--use_segm_data", action="store_true", help="Use segmentation data")
59
+ parser.add_argument("--weight_cap", default=0.15, type=float, help="Sampling weight for caption data")
60
+ parser.add_argument("--weight_reg", default=0.40, type=float, help="Sampling weight for region data")
61
+ parser.add_argument("--weight_segm", default=0.45, type=float, help="Sampling weight for segmentation data")
62
+ parser.add_argument("--dataset_dir", default="./data", type=str)
63
+ parser.add_argument("--seg_dataset", default="Semantic_Segm||Refer_Segm||RefCoco_GCG||PSG_GCG||Flickr_GCG||GranDf_GCG",
64
+ type=str, help="Choose from: Semantic_Segm, Refer_Segm, RefCoco_GCG, GranDf_GCG, PSG_GCG, Flickr_GCG, GrandRefer_Segm")
65
+ parser.add_argument("--segm_sample_rates", default="5,4,3,3,3,1", type=str)
66
+ parser.add_argument("--reg_dataset", default="RefCoco_Reg||RefCocoG_Reg||RefCocoP_Reg||VisGen_Reg",
67
+ type=str, help="Choose from: RefCoco_Reg, RefCocoG_Reg, RefCocoP_Reg, VisGen_Reg, Flickr_Reg, GrandRefer_Reg")
68
+ parser.add_argument("--reg_sample_rates", default="1,1,1,1", type=str)
69
+ parser.add_argument("--cap_dataset", default="CocoCap||LLaVaInstruct", type=str,
70
+ help="Choose from: CocoCap, LLaVaInstruct, GrandCaptionDataset")
71
+ parser.add_argument("--cap_sample_rates", default="1,1", type=str)
72
+ parser.add_argument("--semantic_segm_data", default="ade20k||cocostuff||pascal_part||paco_lvis||mapillary", type=str)
73
+ parser.add_argument("--refer_segm_data", default="refcoco||refcoco+||refcocog||refclef", type=str)
74
+ parser.add_argument("--vqa_data", default="llava_instruct_150k", type=str)
75
+ parser.add_argument("--num_classes_per_sample", default=3, type=int)
76
+
77
+ # Training settings
78
+ parser.add_argument("--pretrained", action="store_true")
79
+ parser.add_argument("--resume", default="", type=str)
80
+ parser.add_argument("--auto_resume", action="store_true")
81
+ parser.add_argument("--weight", default="", type=str)
82
+ parser.add_argument("--lr", default=0.0003, type=float)
83
+ parser.add_argument("--epochs", default=10, type=int)
84
+ parser.add_argument("--steps_per_epoch", default=500, type=int)
85
+ parser.add_argument("--batch_size", default=2, type=int, help="batch size per device per step")
86
+ parser.add_argument("--grad_accumulation_steps", default=10, type=int)
87
+ parser.add_argument("--val_batch_size", default=1, type=int)
88
+ parser.add_argument("--workers", default=2, type=int)
89
+ parser.add_argument("--lora_r", default=8, type=int)
90
+ parser.add_argument("--lora_alpha", default=16, type=int)
91
+ parser.add_argument("--lora_dropout", default=0.05, type=float)
92
+ parser.add_argument("--ce_loss_weight", default=1.0, type=float)
93
+ parser.add_argument("--dice_loss_weight", default=0.5, type=float)
94
+ parser.add_argument("--bce_loss_weight", default=2.0, type=float)
95
+ parser.add_argument("--beta1", default=0.9, type=float)
96
+ parser.add_argument("--beta2", default=0.95, type=float)
97
+ parser.add_argument("--gradient_checkpointing", action="store_true", default=True)
98
+ parser.add_argument("--train_mask_decoder", action="store_true", default=True)
99
+ parser.add_argument("--use_mm_start_end", action="store_true", default=True)
100
+ parser.add_argument("--print_freq", default=1, type=int)
101
+ parser.add_argument("--start_epoch", default=0, type=int)
102
+ parser.add_argument("--local_rank", default=0, type=int, help="node rank")
103
+
104
+ # Evaluation settings
105
+ parser.add_argument("--val_dataset", default="CocoCapVal|RefCOCOgRegVal|RefCOCOgSegmVal", type=str,
106
+ help="Choose from: CocoCapVal, RefCOCOgRegVal, VisGenomeRegVal, RefCOCOgSegmVal, PsgGCGVal, "
107
+ "RefCocoGCGVal, FlickrGCGVal")
108
+ parser.add_argument("--mask_validation", action="store_true")
109
+ parser.add_argument("--no_eval", action="store_true")
110
+ parser.add_argument("--eval_only", action="store_true")
111
+
112
+ # Experiment settings
113
+ parser.add_argument("--log_base_dir", default="./output", type=str)
114
+ parser.add_argument("--exp_name", default="GlamFinetuneOS", type=str)
115
+
116
+ return parser.parse_args(args)
117
+
118
+
119
+ def initialize_environment(args):
120
+ """ Set up logging and model directories. """
121
+ args.log_dir = os.path.join(args.log_base_dir, args.exp_name)
122
+ if args.local_rank == 0:
123
+ os.makedirs(args.log_dir, exist_ok=True)
124
+ return SummaryWriter(args.log_dir)
125
+ return None
126
+
127
+
128
+ def setup_tokenizer_and_special_tokens(args):
129
+ """ Load tokenizer and add special tokens. """
130
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
131
+ args.version, model_max_length=args.model_max_length, padding_side="right", use_fast=False
132
+ )
133
+ print('\033[92m' + "---- Initialized tokenizer from: {} ----".format(args.version) + '\033[0m')
134
+ tokenizer.pad_token = tokenizer.unk_token
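+ # Reuse the <unk> token for padding, since the underlying LLaMA-style tokenizer has no dedicated pad token.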
135
+
136
+ if not args.pretrained:
137
+ if args.use_mm_start_end:
138
+ tokenizer.add_tokens(
139
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
140
+ )
141
+ # modifications specific for regions
142
+ reg_tokens = ['<bbox>', '<point>']
143
+ # Adding special tokens for pixel grounding
144
+ segmentation_tokens = ['[SEG]']
145
+ # Adding tokens for GCG
146
+ phrase_tokens = ['<p>', '</p>']
147
+ special_tokens = reg_tokens + segmentation_tokens + phrase_tokens
148
+ tokenizer.add_tokens(special_tokens, special_tokens=True)
149
+
150
+ args.bbox_token_idx = tokenizer("<bbox>", add_special_tokens=False).input_ids[0]
151
+ args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
152
+ args.bop_token_idx = tokenizer("<p>", add_special_tokens=False).input_ids[0]
153
+ args.eop_token_idx = tokenizer("</p>", add_special_tokens=False).input_ids[0]
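+ # Cache the ids of the newly added special tokens; they are passed to the model so it can
+ # locate <bbox>/<point>, [SEG] and <p>...</p> spans in the token stream.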
154
+
155
+ return tokenizer
156
+
157
+
158
+ def initialize_model(args, tokenizer):
159
+ """ Initialize the GLaMM model. """
160
+ model_args = {k: getattr(args, k) for k in
161
+ ["train_mask_decoder", "out_dim", "ce_loss_weight", "dice_loss_weight", "bce_loss_weight",
162
+ "seg_token_idx", "vision_pretrained", "vision_tower", "use_mm_start_end", "mm_vision_select_layer",
163
+ "pretrain_mm_mlp_adapter", "tune_mm_mlp_adapter", "freeze_mm_mlp_adapter", "mm_use_im_start_end",
164
+ "with_region", "bbox_token_idx", "eop_token_idx", "bop_token_idx"]}
165
+ model_args["num_level_reg_features"] = 4
166
+
167
+ model = GLaMMForCausalLM.from_pretrained(
168
+ args.version, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, **model_args
169
+ )
170
+ print('\033[92m' + "---- Initialized model from: {} ----".format(args.version) + '\033[0m')
171
+
172
+ # Configure model tokens
173
+ model.config.eos_token_id = tokenizer.eos_token_id
174
+ model.config.bos_token_id = tokenizer.bos_token_id
175
+ model.config.pad_token_id = tokenizer.pad_token_id
176
+
177
+ return model
178
+
179
+
180
+ def prepare_model_for_training(model, tokenizer, args):
181
+ # Enable input gradients
182
+ model.enable_input_require_grads()
183
+ model.gradient_checkpointing_enable()
184
+
185
+ # Initialize vision tower
186
+ print(
187
+ '\033[92m' + "---- Initialized Global Image Encoder (vision tower) from: {} ----".format(
188
+ args.vision_tower
189
+ ) + '\033[0m'
190
+ )
191
+ model.get_model().initialize_vision_modules(model.get_model().config)
192
+ vision_tower = model.get_model().get_vision_tower()
193
+ vision_tower.to(dtype=torch.bfloat16, device=args.local_rank)
194
+
195
+ # Initialize GLaMM model and adjust requires_grad
196
+ if not args.pretrained:
197
+ model.get_model().initialize_glamm_model(model.get_model().config)
198
+ else:
199
+ for param in model.get_model().grounding_encoder.parameters():
200
+ param.requires_grad = False
201
+ if model.get_model().config.train_mask_decoder:
202
+ model.get_model().grounding_encoder.mask_decoder.train()
203
+ for param in model.get_model().grounding_encoder.mask_decoder.parameters():
204
+ param.requires_grad = True
205
+
206
+ # Projection layer
207
+ model.get_model().text_hidden_fcs.train()
208
+ for param in model.get_model().text_hidden_fcs.parameters():
209
+ param.requires_grad = True
210
+
211
+ # Set requires_grad for vision tower and mm projector
212
+ for p in vision_tower.parameters():
213
+ p.requires_grad = False
214
+ for p in model.get_model().mm_projector.parameters():
215
+ p.requires_grad = False
216
+
217
+ # Set requires_grad based on LoRA training
218
+ lora_r = args.lora_r
219
+ if lora_r == 0:
220
+ for p in model.get_model().layers.parameters():
221
+ p.requires_grad = True
222
+ for p in model.get_model().mm_projector.parameters():
223
+ p.requires_grad = True
224
+
225
+ # Configure conversation library
226
+ conversation_lib.default_conversation = conversation_lib.conv_templates[args.conv_type]
227
+
228
+ # Configure LoRA if applicable
229
+ if lora_r > 0:
230
+ lora_config = setup_lora_config(model, args)
231
+ model = get_peft_model(model, lora_config)
232
+
233
+ # Resize token embeddings
234
+ model.resize_token_embeddings(len(tokenizer))
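+ # Grow the input/output embedding matrices to cover the special tokens added in setup_tokenizer_and_special_tokens().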
235
+
236
+ # Make certain modules trainable
237
+ set_trainable_modules(model)
238
+
239
+
240
+ def setup_lora_config(model, args):
241
+ """ Configure LoRA settings for the model. """
242
+
243
+ def find_proj_layers(model, target_modules):
244
+ """ Identify projection layers in the model for LoRA adaptation. """
245
+ linear_cls = torch.nn.Linear
246
+ lora_module_names = set()
247
+ for name, module in model.named_modules():
248
+ if (isinstance(module, linear_cls) and all(
249
+ x not in name for x in ["grounding_encoder", "vision_tower", "mm_projector", "text_hidden_fcs"]
250
+ ) and any(x in name for x in target_modules)):
251
+ lora_module_names.add(name)
252
+ return sorted(list(lora_module_names))
253
+
254
+ # Extracting LoRA target modules
255
+ lora_target_modules = args.lora_target_modules.split(",")
256
+ lora_module_names = find_proj_layers(model, lora_target_modules)
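+ # With the default "q_proj,v_proj" setting, only attention projection linears outside the grounding
+ # encoder, vision tower, mm projector and text projection layers receive LoRA adapters.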
257
+
258
+ # Configuring LoRA
259
+ lora_config = LoraConfig(
260
+ r=args.lora_r, lora_alpha=args.lora_alpha, target_modules=lora_module_names, lora_dropout=args.lora_dropout,
261
+ bias="none", task_type="CAUSAL_LM"
262
+ )
263
+ return lora_config
264
+
265
+
266
+ def set_trainable_modules(model):
267
+ """ Make specified modules in the model trainable. """
268
+ trainable_modules = ["lm_head", "embed_tokens", "mask_decoder", "text_hidden_fcs", "region_encoder"]
269
+ for name, param in model.named_parameters():
270
+ if any(module in name for module in trainable_modules):
271
+ print(f"Making trainable: {name}, Shape: {param.shape}")
272
+ param.requires_grad = True
273
+
274
+ def count_parameters(model):
275
+ total_params = sum(p.numel() for p in model.parameters())
276
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
277
+
278
+ print('\033[92m' + "---- Total parameters: ----{}".format(total_params) + '\033[0m')
279
+ print('\033[92m' + "---- Trainable parameters: ----{}".format(trainable_params) + '\033[0m')
280
+
281
+ count_parameters(model)
282
+
283
+
284
+ def initialize_datasets_and_loaders(args, tokenizer):
285
+ world_size = torch.cuda.device_count()
286
+ args.distributed = world_size > 1
287
+
288
+ # Common dataset arguments
289
+ common_ds_args = {"dataset_dir": args.dataset_dir, "tokenizer": tokenizer,
290
+ "global_image_encoder": args.vision_tower,
291
+ "epoch_samples": args.batch_size * args.grad_accumulation_steps * args.steps_per_epoch * world_size,
292
+ "precision": args.precision, "image_size": args.image_size,
293
+ "num_classes_per_sample": args.num_classes_per_sample}
294
+
295
+ # Training datasets
296
+ cap_train_dataset = HybridCapDataset(
297
+ **common_ds_args, dataset=args.cap_dataset, sample_rate=[float(x) for x in args.cap_sample_rates.split(",")],
298
+ batch_size=args.batch_size, ) if args.use_cap_data else None
299
+ reg_train_dataset = HybridRegDataset(
300
+ **common_ds_args, dataset=args.reg_dataset, sample_rate=[float(x) for x in args.reg_sample_rates.split(",")],
301
+ batch_size=args.batch_size, ) if args.use_reg_data else None
302
+ seg_train_dataset = HybridSegDataset(
303
+ **common_ds_args, dataset=args.seg_dataset, sample_rate=[float(x) for x in args.segm_sample_rates.split(",")],
304
+ semantic_segm_data=args.semantic_segm_data, refer_segm_data=args.refer_segm_data,
305
+ batch_size=args.batch_size, ) if args.use_segm_data else None
306
+
307
+ # Validation datasets
308
+ val_datasets = []
309
+ if not args.no_eval:
310
+ val_dataset_classes = {'CocoCapVal': CocoCapDataset,
311
+ 'RefCOCOgRegVal': RefCocoGRegDataset,
312
+ 'VisGenomeRegVal': VisualGenomeRegDataset,
313
+ 'RefCOCOgSegmVal': ReferSegmDataset,
314
+ 'PsgGCGVal': OpenPsgGCGDataset,
315
+ 'RefCocoGCGVal': RefCOCOgGCGDataset,
316
+ 'FlickrGCGVal': Flickr30kGCGDataset,
317
+ }
318
+ for val_dataset_name in args.val_dataset.split('|'):
319
+ val_dataset_class = val_dataset_classes.get(val_dataset_name)
320
+ if val_dataset_class:
321
+ if val_dataset_class == ReferSegmDataset:
322
+ # Modify this if other datasets in refer_segm_data need to be included in val
323
+ refer_segm_data = 'refcocog'
324
+ all_datasets = refer_segm_data.split("||")
325
+ for d in all_datasets:
326
+ val_dataset_class = val_dataset_class(
327
+ **common_ds_args, validation=True, refer_segm_data=d, split='val'
328
+ )
329
+ val_dataset_class._set_len(len(val_dataset_class.refer_segm_data[d]['images']))
330
+ val_datasets.append(val_dataset_class)
331
+ else:
332
+ val_datasets.append(val_dataset_class(**common_ds_args, validation=True))
333
+
334
+ return cap_train_dataset, reg_train_dataset, seg_train_dataset, val_datasets
335
+
336
+
337
+ def setup_data_loaders(args, cap_train_dataset, reg_train_dataset, seg_train_dataset, val_datasets, tokenizer):
338
+ sampler_args = {"shuffle": False, "drop_last": False}
339
+ train_loader_args = {"batch_size": args.batch_size, "shuffle": False, "num_workers": args.workers,
340
+ "pin_memory": False}
341
+ val_loader_args = {"batch_size": args.val_batch_size, "shuffle": False, "num_workers": args.workers,
342
+ "pin_memory": False}
343
+ collate_fn_args_train = partial(
344
+ custom_collate_fn, tokenizer=tokenizer, use_mm_start_end=args.use_mm_start_end, local_rank=args.local_rank,
345
+ inference=False
346
+ )
347
+ inference_mode = args.mask_validation
348
+ collate_fn_args_val = partial(
349
+ custom_collate_fn, tokenizer=tokenizer, use_mm_start_end=args.use_mm_start_end, local_rank=args.local_rank,
350
+ inference=inference_mode
351
+ )
352
+
353
+ # Training loaders
354
+ cap_train_loader = torch.utils.data.DataLoader(
355
+ cap_train_dataset, sampler=torch.utils.data.distributed.DistributedSampler(
356
+ cap_train_dataset, **sampler_args
357
+ ), collate_fn=collate_fn_args_train, **train_loader_args
358
+ ) if cap_train_dataset is not None else None
359
+ reg_train_loader = torch.utils.data.DataLoader(
360
+ reg_train_dataset, sampler=torch.utils.data.distributed.DistributedSampler(
361
+ reg_train_dataset, **sampler_args
362
+ ), collate_fn=collate_fn_args_train, **train_loader_args
363
+ ) if reg_train_dataset is not None else None
364
+ seg_train_loader = torch.utils.data.DataLoader(
365
+ seg_train_dataset, sampler=torch.utils.data.distributed.DistributedSampler(
366
+ seg_train_dataset, **sampler_args
367
+ ), collate_fn=collate_fn_args_train, **train_loader_args
368
+ ) if seg_train_dataset is not None else None
369
+
370
+ # Validation loader
371
+ val_loader = None
372
+ if val_datasets:
373
+ combined_val_datasets = ConcatDataset(val_datasets)
374
+ val_loader = torch.utils.data.DataLoader(
375
+ combined_val_datasets, **val_loader_args, collate_fn=collate_fn_args_val,
376
+ sampler=torch.utils.data.distributed.DistributedSampler(combined_val_datasets, **sampler_args), )
377
+
378
+ return cap_train_loader, reg_train_loader, seg_train_loader, val_loader
379
+
380
+
381
+ def initialize_deepspeed(model, tokenizer, args):
382
+ ds_config = {"train_micro_batch_size_per_gpu": args.batch_size,
383
+ "gradient_accumulation_steps": args.grad_accumulation_steps,
384
+ "optimizer": {"type": "AdamW", "params": {"lr": args.lr, "weight_decay": 0.0,
385
+ "betas": (args.beta1, args.beta2)}},
386
+ "scheduler": {"type": "WarmupDecayLR",
387
+ "params": {"total_num_steps": args.epochs * args.steps_per_epoch, "warmup_min_lr": 0,
388
+ "warmup_max_lr": args.lr, "warmup_num_steps": 100, "warmup_type": "linear"}},
389
+ "fp16": {"enabled": args.precision == "fp16"}, "bf16": {"enabled": args.precision == "bf16"},
390
+ "gradient_clipping": 1.0,
391
+ "zero_optimization": {"stage": 2, "contiguous_gradients": True, "overlap_comm": True,
392
+ "reduce_scatter": True, "reduce_bucket_size": 5e8,
393
+ "allgather_bucket_size": 5e8}, }
394
+
395
+ model_engine, optimizer, _, scheduler = deepspeed.initialize(
396
+ model=model, model_parameters=model.parameters(), collate_fn=partial(
397
+ custom_collate_fn, tokenizer=tokenizer, use_mm_start_end=args.use_mm_start_end, local_rank=args.local_rank
398
+ ), config=ds_config
399
+ )
400
+
401
+ return model_engine, optimizer, scheduler
402
+
403
+
404
+ def resume_training_from_checkpoint(model_engine, args):
405
+ if args.auto_resume and not args.resume:
406
+ resume = os.path.join(args.log_dir, "ckpt_model")
407
+ if os.path.exists(resume):
408
+ args.resume = resume
409
+
410
+ if args.resume:
411
+ load_path, client_state = model_engine.load_checkpoint(args.resume)
412
+ with open(os.path.join(args.resume, "latest"), "r") as f:
413
+ ckpt_dir = f.readlines()[0].strip()
414
+ args.start_epoch = int(ckpt_dir.replace("global_step", "")) // args.steps_per_epoch
415
+ print(f"Resume training from {args.resume}, start from epoch {args.start_epoch}")
416
+
417
+
418
+ def main(args):
419
+ tokenizer = setup_tokenizer_and_special_tokens(args)
420
+ model = initialize_model(args, tokenizer)
421
+ prepare_model_for_training(model, tokenizer, args)
422
+
423
+ model_engine, optimizer, scheduler = initialize_deepspeed(model, tokenizer, args)
424
+ resume_training_from_checkpoint(model_engine, args)
425
+
426
+ cap_train_dataset, reg_train_dataset, seg_train_dataset, val_datasets = (
427
+ initialize_datasets_and_loaders(args, tokenizer))
428
+ cap_train_loader, reg_train_loader, seg_train_loader, val_loader = (
429
+ setup_data_loaders(args, cap_train_dataset, reg_train_dataset, seg_train_dataset, val_datasets, tokenizer))
430
+
431
+ # Determine active datasets and their weights
432
+ active_dataloaders = []
433
+ weights = []
434
+
435
+ if args.use_cap_data:
436
+ active_dataloaders.append(('cap', cap_train_loader))
437
+ weights.append(args.weight_cap)
438
+ if args.use_reg_data:
439
+ active_dataloaders.append(('reg', reg_train_loader))
440
+ weights.append(args.weight_reg)
441
+ if args.use_segm_data:
442
+ active_dataloaders.append(('seg', seg_train_loader))
443
+ weights.append(args.weight_segm)
444
+
445
+ # Assert that at least one dataset is active
446
+ assert active_dataloaders, "Error: At least one dataset (segm, reg, or cap) must be active."
447
+
448
+ dataset_iters = {'cap': iter(cap_train_loader) if args.use_cap_data else None,
449
+ 'reg': iter(reg_train_loader) if args.use_reg_data else None,
450
+ 'seg': iter(seg_train_loader) if args.use_segm_data else None, }
451
+
452
+ writer = initialize_environment(args)
453
+
454
+ if args.eval_only:
455
+ cur_val_loss = validate_model_performance(val_loader, model_engine, 0, writer, args)[0]
456
+ exit()
457
+
458
+ epoch_seeds = [random.randint(0, 100000) for _ in range(args.epochs)]
459
+ dataset_choices = [idx for idx, _ in enumerate(active_dataloaders)]
460
+
461
+ best_giou, best_ciou, best_val_loss = 0.0, 0.0, np.inf
462
+ for epoch in range(args.start_epoch, args.epochs):
463
+ random.seed(epoch_seeds[epoch])
464
+
465
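+ # Draw one data source per step for the whole epoch up front; random.choices samples with
+ # replacement according to the configured dataset weights.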
+ step_choices = random.choices(dataset_choices, weights=weights, k=args.steps_per_epoch)
466
+
467
+ dataset_iters = train(
468
+ active_dataloaders, model_engine, epoch, scheduler, writer, dataset_iters, args, step_choices
469
+ )
470
+
471
+ if args.mask_validation:
472
+ giou, ciou = validate_model_performance(val_loader, model_engine, epoch, writer, args)
473
+ is_best = giou > best_giou
474
+ best_giou = max(giou, best_giou)
475
+ best_ciou = ciou if is_best else best_ciou
476
+ if args.local_rank == 0: # Log the progress
477
+ print(f"Epoch: {epoch}, giou: {giou}, ciou: {ciou}, best_giou: {best_giou}, best_ciou: {best_ciou}")
478
+ save_checkpoint(model_engine, args, epoch, 'giou-ciou', f"{giou:.4f}-{ciou:.4f}", is_best)
479
+ else:
480
+ cur_val_loss = validate_model_performance(val_loader, model_engine, epoch, writer, args)
481
+ is_best = cur_val_loss < best_val_loss
482
+ best_val_loss = min(cur_val_loss, best_val_loss)
483
+ if args.local_rank == 0: # Log the progress
484
+ print(f"Epoch: {epoch}, Current Validation Loss: {cur_val_loss:.4f}, Best Validation Loss: {best_val_loss:}")
485
+ save_checkpoint(model_engine, args, epoch, 'loss', f"{cur_val_loss:.4f}", is_best)
486
+
487
+
488
+ def save_checkpoint(model_engine, args, epoch, metric_name, metric_value, is_best):
489
+ """ Saves the model checkpoint. """
490
+ # If the checkpoint is the best, save it in ckpt_model_best, else in ckpt_model_last_epoch
491
+ save_dir_name = "ckpt_model_best" if is_best else "ckpt_model_last_epoch"
492
+ save_dir = os.path.join(args.log_dir, save_dir_name)
493
+ # Ensure the directory exists
494
+ if args.local_rank == 0:
495
+ os.makedirs(save_dir, exist_ok=True)
496
+ ckpt_filename = f"epoch_{epoch}_val_{metric_name}_{metric_value}.pth"
497
+ torch.save({"epoch": epoch, f"val_{metric_name}": metric_value}, os.path.join(save_dir, ckpt_filename))
498
+ torch.distributed.barrier()
499
+ model_engine.save_checkpoint(save_dir)
500
+
501
+
502
+ def train(active_datasets, model, epoch, scheduler, writer, dataset_iters, args, step_choices):
503
+ """Main training loop."""
504
+
505
+ def get_next_input(iterator, data_loader):
506
+ """Retrieve next input from the iterator, or reinitialize if necessary."""
507
+ try:
508
+ return next(iterator), iterator
509
+ except StopIteration:
510
+ new_iterator = iter(data_loader)
511
+ return next(new_iterator), new_iterator
512
+
513
+ def log_progress():
514
+ """Log training progress."""
515
+ if global_step % args.print_freq == 0:
516
+ if args.distributed:
517
+ for tracker in trackers.values():
518
+ tracker.all_reduce()
519
+
520
+ if args.local_rank == 0:
521
+ progress.display(global_step + 1)
522
+ for key, tracker in trackers.items():
523
+ writer.add_scalar(f"train/{key}", tracker.avg, global_step)
524
+ writer.add_scalar("metrics/total_secs_per_batch", batch_time.avg, global_step)
525
+ writer.add_scalar("metrics/data_secs_per_batch", data_time.avg, global_step)
526
+
527
+ for tracker in trackers.values():
528
+ tracker.reset()
529
+
530
+ batch_time = AverageMeter("Time", ":.4f")
531
+ data_time = AverageMeter("Data", ":.4f")
532
+ trackers = {"loss": AverageMeter("Loss", ":.4f"),
533
+ "ce_loss": AverageMeter("CeLoss", ":.4f"),
534
+ "mask_bce_loss": AverageMeter("MaskBCELoss", ":.4f"),
535
+ "mask_dice_loss": AverageMeter("MaskDICELoss", ":.4f"),
536
+ "mask_loss": AverageMeter("MaskLoss", ":.4f")}
537
+ progress = ProgressMeter(args.steps_per_epoch, list(trackers.values()), prefix=f"Epoch: [{epoch}]")
538
+
539
+ model.train()
540
+ end = time.time()
541
+ for global_step in range(args.steps_per_epoch):
542
+ for _ in range(args.grad_accumulation_steps):
543
+ # Select data loader based on step choice
544
+ dataset_type, data_loader = active_datasets[step_choices[global_step]]
545
+ data_batch, new_iter = get_next_input(dataset_iters[dataset_type], data_loader)
546
+ dataset_iters[dataset_type] = new_iter
547
+
548
+ data_time.update(time.time() - end)
549
+ # Prepare data and convert relevant tensors to bfloat16
550
+ data_batch = dict_to_cuda(data_batch)
551
+ for key in ["global_enc_images", "grounding_enc_images"]:
552
+ if data_batch[key] is not None:
553
+ data_batch[key] = data_batch[key].bfloat16()
554
+
555
+ output_dict = model(**data_batch)
556
+
557
+ # Update training metrics
558
+ for key, tracker in trackers.items():
559
+ if key in output_dict:
560
+ tracker.update(output_dict[key].item(), data_batch["global_enc_images"].size(0))
561
+
562
+ model.backward(output_dict["loss"])
563
+ model.step()
564
+
565
+ batch_time.update(time.time() - end)
566
+ end = time.time()
567
+ log_progress()
568
+
569
+ if global_step != 0:
570
+ curr_lr = scheduler.get_last_lr()
571
+ if args.local_rank == 0:
572
+ writer.add_scalar("train/lr", curr_lr[0], global_step)
573
+
574
+ return dataset_iters
575
+
576
+
577
+ def validate_model_performance(validation_loader, training_model, current_epoch, tensorboard_writer, args):
578
+ if args.mask_validation:
579
+ # For use with only segmentation/GCG type datasets
580
+ trackers = {"intersection": AverageMeter("Intersec", ":.4f", Summary.SUM),
581
+ "union": AverageMeter("Union", ":.4f", Summary.SUM),
582
+ "gIoU": AverageMeter("gIoU", ":.4f", Summary.SUM)}
583
+
584
+ training_model.eval()
585
+ for data_batch in tqdm.tqdm(validation_loader):
586
+ # Prepare data and convert relevant tensors to bfloat16
587
+ data_batch = dict_to_cuda(data_batch)
588
+ for key in ["global_enc_images", "grounding_enc_images"]:
589
+ data_batch[key] = data_batch[key].bfloat16()
590
+ torch.cuda.empty_cache()
591
+ # Model inference without gradient tracking
592
+ with torch.no_grad():
593
+ results = training_model(**data_batch)
594
+
595
+ predictions = results["pred_masks"]
596
+ gt_masks = results["gt_masks"][0].int()
597
+ # Note: An error at this line may suggest that the dataset used for validation does not support
598
+ # segmentation tasks. Ensure that the dataset is appropriate for segmentation analysis.
599
+ predicted_masks = (predictions[0] > 0).int()
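+ # Thresholding the mask logits at 0 corresponds to a 0.5 probability cutoff after sigmoid.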
600
+ assert len(predictions) == 1
601
+
602
+ intersection, union, accuracy_iou = 0.0, 0.0, 0.0
603
+ for target, prediction in zip(gt_masks, predicted_masks):
604
+ intersect, union_, _ = intersectionAndUnionGPU(
605
+ prediction.contiguous().clone(), target.contiguous(), 2, ignore_index=255
606
+ )
607
+ intersection += intersect
608
+ union += union_
609
+ accuracy_iou += intersect / (union_ + 1e-5)
610
+ # handles no-object targets
611
+ accuracy_iou[union_ == 0] += 1.0
612
+
613
+ intersection, union = intersection.cpu().numpy(), union.cpu().numpy()
614
+ accuracy_iou = accuracy_iou.cpu().numpy() / gt_masks.shape[0]
615
+ trackers["intersection"].update(intersection)
616
+ trackers["union"].update(union)
617
+ trackers["gIoU"].update(accuracy_iou, n=gt_masks.shape[0])
618
+
619
+ for meter in trackers.values():
620
+ meter.all_reduce()
621
+
622
+ iou_per_class = trackers["intersection"].sum / (trackers["union"].sum + 1e-10)
623
+ class_iou = iou_per_class[1]
624
+ global_iou = trackers["gIoU"].avg[1]
625
+
626
+ if args.local_rank == 0:
627
+ tensorboard_writer.add_scalar("val/giou", global_iou, current_epoch)
628
+ tensorboard_writer.add_scalar("val/ciou", class_iou, current_epoch)
629
+ print("giou: {:.4f}, ciou: {:.4f}".format(global_iou, class_iou))
630
+
631
+ return global_iou, class_iou
632
+ else:
633
+ # Initializing performance trackers
634
+ trackers = {"loss": AverageMeter("Loss", ":.4f"), "ce_loss": AverageMeter("CeLoss", ":.4f"),
635
+ "mask_bce_loss": AverageMeter("MaskBCELoss", ":.4f"),
636
+ "mask_dice_loss": AverageMeter("MaskDICELoss", ":.4f"),
637
+ "mask_loss": AverageMeter("MaskLoss", ":.4f")}
638
+
639
+ # Prepare model for validation phase
640
+ # Hack to get the loss
641
+ training_model.train()
642
+
643
+ for data_batch in tqdm.tqdm(validation_loader):
644
+ # Prepare data and convert relevant tensors to bfloat16
645
+ data_batch = dict_to_cuda(data_batch)
646
+ for key in ["global_enc_images", "grounding_enc_images"]:
647
+ if data_batch[key] is not None:
648
+ data_batch[key] = data_batch[key].bfloat16()
649
+ torch.cuda.empty_cache()
650
+ # Model inference without gradient tracking
651
+ with torch.no_grad():
652
+ predictions = training_model(**data_batch)
653
+ # Update performance metrics
654
+ for key, tracker in trackers.items():
655
+ tracker.update(predictions[key].item(), data_batch["global_enc_images"].size(0))
656
+
657
+ # Synchronize metrics across processes
658
+ for tracker in trackers.values():
659
+ tracker.all_reduce()
660
+ # Calculate average validation loss
661
+ avg_val_loss = trackers["ce_loss"].avg
662
+ # Tensorboard logging for primary process
663
+ if args.local_rank == 0:
664
+ tensorboard_writer.add_scalar("val/loss", avg_val_loss, current_epoch)
665
+
666
+ return avg_val_loss
667
+
668
+
669
+ if __name__ == "__main__":
670
+ args = parse_args(sys.argv[1:])
671
+ main(args)
lightning-hydra-template/.github/codecov.yml ADDED
@@ -0,0 +1,15 @@
1
+ coverage:
2
+ status:
3
+ # measures overall project coverage
4
+ project:
5
+ default:
6
+ threshold: 100% # coverage may drop by up to this much and still be reported as success
7
+
8
+ # measures PR or single commit coverage
9
+ patch:
10
+ default:
11
+ threshold: 100% # coverage may drop by up to this much and still be reported as success
12
+
13
+
14
+ # project: off
15
+ # patch: off
lightning-hydra-template/.github/workflows/test.yml ADDED
@@ -0,0 +1,139 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main, "release/*", "dev"]
8
+
9
+ jobs:
10
+ run_tests_ubuntu:
11
+ runs-on: ${{ matrix.os }}
12
+
13
+ strategy:
14
+ fail-fast: false
15
+ matrix:
16
+ os: ["ubuntu-latest"]
17
+ python-version: ["3.8", "3.9", "3.10"]
18
+
19
+ timeout-minutes: 20
20
+
21
+ steps:
22
+ - name: Checkout
23
+ uses: actions/checkout@v3
24
+
25
+ - name: Set up Python ${{ matrix.python-version }}
26
+ uses: actions/setup-python@v3
27
+ with:
28
+ python-version: ${{ matrix.python-version }}
29
+
30
+ - name: Install dependencies
31
+ run: |
32
+ python -m pip install --upgrade pip
33
+ pip install -r requirements.txt
34
+ pip install pytest
35
+ pip install sh
36
+
37
+ - name: List dependencies
38
+ run: |
39
+ python -m pip list
40
+
41
+ - name: Run pytest
42
+ run: |
43
+ pytest -v
44
+
45
+ run_tests_macos:
46
+ runs-on: ${{ matrix.os }}
47
+
48
+ strategy:
49
+ fail-fast: false
50
+ matrix:
51
+ os: ["macos-latest"]
52
+ python-version: ["3.8", "3.9", "3.10"]
53
+
54
+ timeout-minutes: 20
55
+
56
+ steps:
57
+ - name: Checkout
58
+ uses: actions/checkout@v3
59
+
60
+ - name: Set up Python ${{ matrix.python-version }}
61
+ uses: actions/setup-python@v3
62
+ with:
63
+ python-version: ${{ matrix.python-version }}
64
+
65
+ - name: Install dependencies
66
+ run: |
67
+ python -m pip install --upgrade pip
68
+ pip install -r requirements.txt
69
+ pip install pytest
70
+ pip install sh
71
+
72
+ - name: List dependencies
73
+ run: |
74
+ python -m pip list
75
+
76
+ - name: Run pytest
77
+ run: |
78
+ pytest -v
79
+
80
+ run_tests_windows:
81
+ runs-on: ${{ matrix.os }}
82
+
83
+ strategy:
84
+ fail-fast: false
85
+ matrix:
86
+ os: ["windows-latest"]
87
+ python-version: ["3.8", "3.9", "3.10"]
88
+
89
+ timeout-minutes: 20
90
+
91
+ steps:
92
+ - name: Checkout
93
+ uses: actions/checkout@v3
94
+
95
+ - name: Set up Python ${{ matrix.python-version }}
96
+ uses: actions/setup-python@v3
97
+ with:
98
+ python-version: ${{ matrix.python-version }}
99
+
100
+ - name: Install dependencies
101
+ run: |
102
+ python -m pip install --upgrade pip
103
+ pip install -r requirements.txt
104
+ pip install pytest
105
+
106
+ - name: List dependencies
107
+ run: |
108
+ python -m pip list
109
+
110
+ - name: Run pytest
111
+ run: |
112
+ pytest -v
113
+
114
+ # upload code coverage report
115
+ code-coverage:
116
+ runs-on: ubuntu-latest
117
+
118
+ steps:
119
+ - name: Checkout
120
+ uses: actions/checkout@v2
121
+
122
+ - name: Set up Python 3.10
123
+ uses: actions/setup-python@v2
124
+ with:
125
+ python-version: "3.10"
126
+
127
+ - name: Install dependencies
128
+ run: |
129
+ python -m pip install --upgrade pip
130
+ pip install -r requirements.txt
131
+ pip install pytest
132
+ pip install pytest-cov[toml]
133
+ pip install sh
134
+
135
+ - name: Run tests and collect coverage
136
+ run: pytest --cov src # NEEDS TO BE UPDATED WHEN CHANGING THE NAME OF "src" FOLDER
137
+
138
+ - name: Upload coverage to Codecov
139
+ uses: codecov/codecov-action@v3
lightning-hydra-template/configs/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # this file is needed here to include configs when building project as a package
lightning-hydra-template/configs/local/.gitkeep ADDED
File without changes
lightning-hydra-template/configs/train.yaml ADDED
@@ -0,0 +1,49 @@
1
+ # @package _global_
2
+
3
+ # specify here default configuration
4
+ # order of defaults determines the order in which configs override each other
5
+ defaults:
6
+ - _self_
7
+ - data: mnist
8
+ - model: mnist
9
+ - callbacks: default
10
+ - logger: null # set logger here or use command line (e.g. `python train.py logger=tensorboard`)
11
+ - trainer: default
12
+ - paths: default
13
+ - extras: default
14
+ - hydra: default
15
+
16
+ # experiment configs allow for version control of specific hyperparameters
17
+ # e.g. best hyperparameters for given model and datamodule
18
+ - experiment: null
19
+
20
+ # config for hyperparameter optimization
21
+ - hparams_search: null
22
+
23
+ # optional local config for machine/user specific settings
24
+ # it's optional since it doesn't need to exist and is excluded from version control
25
+ - optional local: default
26
+
27
+ # debugging config (enable through command line, e.g. `python train.py debug=default`)
28
+ - debug: null
29
+
30
+ # task name, determines output directory path
31
+ task_name: "train"
32
+
33
+ # tags to help you identify your experiments
34
+ # you can overwrite this in experiment configs
35
+ # overwrite from command line with `python train.py tags="[first_tag, second_tag]"`
36
+ tags: ["dev"]
37
+
38
+ # set False to skip model training
39
+ train: True
40
+
41
+ # evaluate on test set, using best model weights achieved during training
42
+ # lightning chooses best weights based on the metric specified in checkpoint callback
43
+ test: True
44
+
45
+ # simply provide checkpoint path to resume training
46
+ ckpt_path: null
47
+
48
+ # seed for random number generators in pytorch, numpy and python.random
49
+ seed: null
lightning-hydra-template/logs/.gitkeep ADDED
File without changes
lightning-hydra-template/tests/test_datamodules.py ADDED
@@ -0,0 +1,38 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+ import torch
5
+
6
+ from src.data.mnist_datamodule import MNISTDataModule
7
+
8
+
9
+ @pytest.mark.parametrize("batch_size", [32, 128])
10
+ def test_mnist_datamodule(batch_size: int) -> None:
11
+ """Tests `MNISTDataModule` to verify that it can be downloaded correctly, that the necessary
12
+ attributes were created (e.g., the dataloader objects), and that dtypes and batch sizes
13
+ correctly match.
14
+
15
+ :param batch_size: Batch size of the data to be loaded by the dataloader.
16
+ """
17
+ data_dir = "data/"
18
+
19
+ dm = MNISTDataModule(data_dir=data_dir, batch_size=batch_size)
20
+ dm.prepare_data()
21
+
22
+ assert not dm.data_train and not dm.data_val and not dm.data_test
23
+ assert Path(data_dir, "MNIST").exists()
24
+ assert Path(data_dir, "MNIST", "raw").exists()
25
+
26
+ dm.setup()
27
+ assert dm.data_train and dm.data_val and dm.data_test
28
+ assert dm.train_dataloader() and dm.val_dataloader() and dm.test_dataloader()
29
+
30
+ num_datapoints = len(dm.data_train) + len(dm.data_val) + len(dm.data_test)
31
+ assert num_datapoints == 70_000
32
+
33
+ batch = next(iter(dm.train_dataloader()))
34
+ x, y = batch
35
+ assert len(x) == batch_size
36
+ assert len(y) == batch_size
37
+ assert x.dtype == torch.float32
38
+ assert y.dtype == torch.int64
lightning-hydra-template/tests/test_eval.py ADDED
@@ -0,0 +1,39 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import pytest
5
+ from hydra.core.hydra_config import HydraConfig
6
+ from omegaconf import DictConfig, open_dict
7
+
8
+ from src.eval import evaluate
9
+ from src.train import train
10
+
11
+
12
+ @pytest.mark.slow
13
+ def test_train_eval(tmp_path: Path, cfg_train: DictConfig, cfg_eval: DictConfig) -> None:
14
+ """Tests training and evaluation by training for 1 epoch with `train.py` then evaluating with
15
+ `eval.py`.
16
+
17
+ :param tmp_path: The temporary logging path.
18
+ :param cfg_train: A DictConfig containing a valid training configuration.
19
+ :param cfg_eval: A DictConfig containing a valid evaluation configuration.
20
+ """
21
+ assert str(tmp_path) == cfg_train.paths.output_dir == cfg_eval.paths.output_dir
22
+
23
+ with open_dict(cfg_train):
24
+ cfg_train.trainer.max_epochs = 1
25
+ cfg_train.test = True
26
+
27
+ HydraConfig().set_config(cfg_train)
28
+ train_metric_dict, _ = train(cfg_train)
29
+
30
+ assert "last.ckpt" in os.listdir(tmp_path / "checkpoints")
31
+
32
+ with open_dict(cfg_eval):
33
+ cfg_eval.ckpt_path = str(tmp_path / "checkpoints" / "last.ckpt")
34
+
35
+ HydraConfig().set_config(cfg_eval)
36
+ test_metric_dict, _ = evaluate(cfg_eval)
37
+
38
+ assert test_metric_dict["test/acc"] > 0.0
39
+ assert abs(train_metric_dict["test/acc"].item() - test_metric_dict["test/acc"].item()) < 0.001