tuandunghcmut commited on
Commit
1c3d47d
·
verified ·
1 Parent(s): 637b6eb

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. VILA/.ipynb_checkpoints/Dockerfile-checkpoint +18 -0
  3. VILA/.ipynb_checkpoints/README-checkpoint.md +341 -0
  4. VILA/.ipynb_checkpoints/environment_setup-checkpoint.sh +33 -0
  5. VILA/CIs/license_all.sh +1 -0
  6. VILA/CIs/license_commited.sh +6 -0
  7. VILA/data_prepare/.DS_Store +0 -0
  8. VILA/data_prepare/LICENSE +8 -0
  9. VILA/data_prepare/README.md +172 -0
  10. VILA/data_prepare/panda70m.sh +34 -0
  11. VILA/data_prepare/panda_split.py +117 -0
  12. VILA/data_prepare/parallel_shards.sh +29 -0
  13. VILA/demo_images/LongVILA-pipeline.png +3 -0
  14. VILA/demo_images/av.png +3 -0
  15. VILA/demo_images/demo_img_1.png +3 -0
  16. VILA/demo_images/demo_img_2.png +3 -0
  17. VILA/demo_images/demo_img_3.png +3 -0
  18. VILA/demo_images/longvila-logo.png +3 -0
  19. VILA/demo_images/vila-logo.jpg +0 -0
  20. VILA/demo_trt_llm/README.md +3 -0
  21. VILA/inference_test/inference_test.json +546 -0
  22. VILA/inference_test/inference_test.py +153 -0
  23. VILA/llava.egg-info/PKG-INFO +287 -0
  24. VILA/llava.egg-info/SOURCES.txt +154 -0
  25. VILA/llava.egg-info/dependency_links.txt +1 -0
  26. VILA/llava.egg-info/requires.txt +37 -0
  27. VILA/llava.egg-info/top_level.txt +7 -0
  28. VILA/llava/.DS_Store +0 -0
  29. VILA/llava/constants.py +31 -0
  30. VILA/llava/conversation.py +489 -0
  31. VILA/llava/entry.py +18 -0
  32. VILA/llava/mm_utils.py +407 -0
  33. VILA/llava/modals.py +26 -0
  34. VILA/scripts/convert_gqa_for_eval.py +33 -0
  35. VILA/scripts/convert_karpathy_to_anno.py +130 -0
  36. VILA/scripts/convert_mmbench_for_submission.py +46 -0
  37. VILA/scripts/convert_mmvet_for_eval.py +33 -0
  38. VILA/scripts/convert_seed_for_submission.py +88 -0
  39. VILA/scripts/convert_sqa_to_llava.py +104 -0
  40. VILA/scripts/convert_sqa_to_llava_base_prompt.py +327 -0
  41. VILA/scripts/convert_vizwiz_for_submission.py +60 -0
  42. VILA/scripts/convert_vqav2_for_submission.py +65 -0
  43. VILA/scripts/extract_mm_projector.py +57 -0
  44. VILA/scripts/zero2.json +23 -0
  45. VILA/scripts/zero3.json +28 -0
  46. VILA/scripts/zero3_mics_mini_fixed.json +30 -0
  47. VILA/scripts/zero3_mics_tiny_fixed.json +30 -0
  48. VILA/scripts/zero3_offload.json +56 -0
  49. VILA/scripts/zero3_offload_inference.json +21 -0
  50. VILA/scripts/zero3pp.json +29 -0
.gitattributes CHANGED
@@ -370,3 +370,9 @@ groundingLMM/gradio-dev/demo/video_identity/video/video_sample.mp4 filter=lfs di
370
  groundingLMM/gradio-dev/demo/video_subtitle/files/a.mp4 filter=lfs diff=lfs merge=lfs -text
371
  groundingLMM/gradio-dev/demo/video_subtitle/files/b.mp4 filter=lfs diff=lfs merge=lfs -text
372
  groundingLMM/gradio-dev/demo/unispeech-speaker-verification/samples/kirsten_dunst.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
370
  groundingLMM/gradio-dev/demo/video_subtitle/files/a.mp4 filter=lfs diff=lfs merge=lfs -text
371
  groundingLMM/gradio-dev/demo/video_subtitle/files/b.mp4 filter=lfs diff=lfs merge=lfs -text
372
  groundingLMM/gradio-dev/demo/unispeech-speaker-verification/samples/kirsten_dunst.wav filter=lfs diff=lfs merge=lfs -text
373
+ VILA/demo_images/demo_img_3.png filter=lfs diff=lfs merge=lfs -text
374
+ VILA/demo_images/LongVILA-pipeline.png filter=lfs diff=lfs merge=lfs -text
375
+ VILA/demo_images/longvila-logo.png filter=lfs diff=lfs merge=lfs -text
376
+ VILA/demo_images/demo_img_2.png filter=lfs diff=lfs merge=lfs -text
377
+ VILA/demo_images/demo_img_1.png filter=lfs diff=lfs merge=lfs -text
378
+ VILA/demo_images/av.png filter=lfs diff=lfs merge=lfs -text
VILA/.ipynb_checkpoints/Dockerfile-checkpoint ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvcr.io/nvidia/pytorch:24.06-py3
2
+
3
+ WORKDIR /app
4
+
5
+ RUN curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o ~/miniconda.sh \
6
+ && sh ~/miniconda.sh -b -p /opt/conda \
7
+ && rm ~/miniconda.sh
8
+
9
+ ENV PATH /opt/conda/bin:$PATH
10
+ COPY pyproject.toml pyproject.toml
11
+ COPY llava llava
12
+
13
+ COPY environment_setup.sh environment_setup.sh
14
+ RUN bash environment_setup.sh vila
15
+
16
+
17
+ COPY server.py server.py
18
+ CMD ["conda", "run", "-n", "vila", "--no-capture-output", "python", "-u", "-W", "ignore", "server.py"]
VILA/.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="center">
2
+ <img src="demo_images/vila-logo.jpg" width="20%"/>
3
+ </p>
4
+
5
+ # VILA: On Pre-training for Visual Language Models
6
+
7
+ [![Code License](https://img.shields.io/badge/Code%20License-Apache_2.0-green.svg)](CODE_LICENSE)
8
+ [![Model License](https://img.shields.io/badge/MODEL%20License-CC%20By%20NC%204.0-red.svg)](MODEL_LICENSE)
9
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)
10
+
11
+ [VILA arxiv](https://arxiv.org/abs/2312.07533) / [VILA Demo](https://vila-demo.hanlab.ai/) / [VILA Huggingface](https://huggingface.co/collections/Efficient-Large-Model/vila-on-pre-training-for-visual-language-models-65d8022a3a52cd9bcd62698e)
12
+
13
+ ## 💡 Introduction
14
+
15
+ VILA is a visual language model (VLM) pretrained with interleaved image-text data at scale, enabling **video understanding** and **multi-image understanding** capabilities. VILA is deployable on the edge by [AWQ](https://arxiv.org/pdf/2306.00978.pdf) 4bit quantization and [TinyChat](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat) framework. We find: (1) image-text pairs are not enough, interleaved image-text is essential; (2) unfreezing LLM during interleaved image-text pre-training enables in-context learning; (3)re-blending text-only instruction data is crucial to boost both VLM and text-only performance; (4) token compression extends #video frames. VILA unveils appealing capabilities, including: video reasoning, in-context learning, visual chain-of-thought, and better world knowledge.
16
+
17
+ ## 💡 News
18
+ - [2024/08] We release [LongVILA](./LongVILA.md) that supports long video understanding (Captioning, QA, Needle-in-a-Haystack) up to 1024 frames.
19
+ - [2024/07] VILA1.5 also ranks 1st place (OSS model) on [MLVU test leaderboard](https://github.com/JUNJIE99/MLVU).
20
+ - [2024/06] VILA1.5 is now the best open sourced VLM on [MMMU leaderboard](https://mmmu-benchmark.github.io/#leaderboard) and [Video-MME](https://video-mme.github.io/home_page.html#leaderboard) leaderboard!
21
+ - [2024/05] We release VILA-1.5, which offers **video understanding capability**. VILA-1.5 comes with four model sizes: 3B/8B/13B/40B.
22
+ - [2024/05] We release [AWQ](https://arxiv.org/pdf/2306.00978.pdf)-quantized 4bit VILA-1.5 models. VILA-1.5 is efficiently deployable on diverse NVIDIA GPUs (A100, 4090, 4070 Laptop, Orin, Orin Nano) by [TinyChat](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat) and [TensorRT-LLM](demo_trt_llm) backends.
23
+ - [2024/03] VILA has been accepted by CVPR 2024!
24
+ - [2024/02] We release [AWQ](https://arxiv.org/pdf/2306.00978.pdf)-quantized 4bit VILA models, deployable on Jetson Orin and laptops through [TinyChat](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat) and [TinyChatEngine](https://github.com/mit-han-lab/TinyChatEngine).
25
+ - [2024/02] VILA is released. We propose interleaved image-text pretraining that enables **multi-image** VLM. VILA comes with impressive in-context learning capabilities. We open source everything: including training code, evaluation code, datasets, model ckpts.
26
+ - [2023/12] [Paper](https://arxiv.org/abs/2312.07533) is on Arxiv!
27
+
28
+ ## Performance
29
+
30
+ ### Image QA Benchmarks
31
+
32
+ | $~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~$ | Prec. | VQAv2 | GQA | VizWiz | SQA-I | VQA-T | POPE | MME | MMB | MMB-CN | SEED | SEED-I | MMMU (val) | MMMU (test) | llava-bench | MM-Vet | Average |
33
+ | -------------------------------- | ----- | ----- | ---- | ------ | ----- | ----- | ---- | ------- | ---- | ------ | ---- | ------ | ---------- | ----------- | ----------- | ------ | ------- |
34
+ | VILA1.5-3B | fp16 | 80.4 | 61.5 | 53.5 | 69.0 | 60.4 | 85.9 | 1442.44 | 63.4 | 52.7 | 60.9 | 67.9 | 33.3 | 30.8 | 75.9 | 35.4 | 60.2 |
35
+ | VILA1.5-3B-AWQ | int4 | 80.0 | 61.1 | 53.8 | 67.8 | 60.4 | 85.9 | 1437.34 | 63.3 | 51.4 | 59.8 | 66.6 | 32.7 | 31.1 | 75.0 | 37.3 | 59.9 |
36
+ | VILA1.5-3B-S2 | fp16 | 79.8 | 61.4 | 61.3 | 69.6 | 63.4 | 85.3 | 1431.65 | 62.8 | 52.2 | 60.0 | 66.4 | 32.8 | 31.3 | 76.7 | 38.6 | 60.9 |
37
+ | VILA1.5-3B-S2-AWQ | int4 | 79.4 | 61.3 | 62.3 | 69.2 | 63.0 | 85.8 | 1417.06 | 61.6 | 51.5 | 59.1 | 65.7 | 33.4 | 30.4 | 77.1 | 36.7 | 60.5 |
38
+ | Llama-3-VILA1.5-8B | fp16 | 83.0 | 63.5 | 63.2 | 82.0 | 68.5 | 85.6 | 1634.91 | 75.3 | 69.9 | 66.4 | 73.8 | 38.6 | 32.7 | 71.9 | 43.2 | 66.6 |
39
+ | Llama-3-VILA1.5-8B-AWQ | int4 | 80.3 | 61.7 | 59.3 | 79.0 | 65.4 | 82.9 | 1593.65 | 71.0 | 64.9 | 64.0 | 71.1 | 36.0 | 36.1 | 79.0 | 37.2 | 64.5 |
40
+ | VILA1.5-13B | fp16 | 82.8 | 64.3 | 62.6 | 80.1 | 65.0 | 86.3 | 1569.55 | 74.9 | 66.3 | 65.1 | 72.6 | 37.9 | 33.6 | 80.8 | 44.3 | 66.3 |
41
+ | VILA1.5-13B-AWQ | int4 | 82.7 | 64.5 | 63.3 | 79.7 | 64.7 | 86.7 | 1531.35 | 74.7 | 66.7 | 65.1 | 72.6 | 37.8 | 34.0 | 81.9 | 46.4 | 66.5 |
42
+ | VILA1.5-40B | fp16 | 84.3 | 64.6 | 62.2 | 87.2 | 73.6 | 87.3 | 1726.82 | 82.4 | 80.2 | 69.1 | 75.8 | 51.9 | 46.9 | 81.3 | 53.0 | 72.4 |
43
+ | VILA1.5-40B-AWQ | int4 | 84.1 | 64.4 | 61.3 | 86.7 | 73.2 | 88.2 | 1714.79 | 83.2 | 79.6 | 68.9 | 75.6 | 49.3 | 46.2 | 83.0 | 51.4 | 72.1 |
44
+
45
+ <sup>NOTE: VQAV2 and VizWiz are test-dev, the average accuracy is calculated over all datasets and MME numbers are divided by 20.</sup>
46
+
47
+ ### Video QA Benchmarks
48
+
49
+ | $~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~$ | Prec. | Perception Test | ActivityNet | MSVD | MSRVTT | TGIF | EgoSchema (test) | CinePile
50
+ | -------------------------------- | ----- | ----- | ---- | ------ | ----- | ----- | ----- | ----- |
51
+ | VILA1.5-3B | fp16 | 47 | 50.2 | 76.6 | 57.5 | 51.7 | 42.6 | 37.9
52
+ | VILA1.5-3B-S2 | fp16 | 49.7 | 50.7 | 76.9 | 57.6 | 51.7 |
53
+ | Llama-3-VILA1.5-8B | fp16 | 54.1 | 54.3 | 78.3 | 60.1 | 54.1 | 50.4 | 48.7
54
+ | VILA1.5-13B | fp16 | 53.6 | 54.7 | 77.9 | 60.2 | 56 | 52.2 | 50.1
55
+ | VILA1.5-40B | fp16 | 54 | 58 | 80.1 | 63 | 58.2 | 58.7 | 51.3
56
+
57
+ ### Inference speed ( Token/sec )
58
+
59
+ | $~~~~~~$ | Precision | A100 | 4090 | Orin |
60
+ | ---------------------- | --------- | ----- | ----- | ---- |
61
+ | VILA1.5-3B | fp16 | 104.6 | 137.6 | 25.4 |
62
+ | VILA1.5-3B-AWQ | int4 | 182.8 | 215.5 | 42.5 |
63
+ | VILA1.5-3B-S2 | fp16 | 104.3 | 137.2 | 24.6 |
64
+ | VILA1.5-3B-S2-AWQ | int4 | 180.2 | 219.3 | 40.1 |
65
+ | Llama-3-VILA1.5-8B | fp16 | 74.9 | 57.4 | 10.2 |
66
+ | Llama-3-VILA1.5-8B-AWQ | int4 | 168.9 | 150.2 | 28.7 |
67
+ | VILA1.5-13B | fp16 | 50.9 | OOM | 6.1 |
68
+ | VILA1.5-13B-AWQ | int4 | 115.9 | 105.7 | 20.6 |
69
+ | VILA1.5-40B | fp16 | OOM | OOM | -- |
70
+ | VILA1.5-40B-AWQ | int4 | 57.0 | OOM | -- |
71
+
72
+ <sup>NOTE: Measured using the [TinyChat](https://github.com/mit-han-lab/llm-awq/tinychat) backend at batch size = 1.</sup>
73
+
74
+ ## VILA Examples
75
+
76
+ ### Video captioning
77
+
78
+ https://github.com/Efficient-Large-Model/VILA/assets/156256291/c9520943-2478-4f97-bc95-121d625018a6
79
+
80
+ Prompt: Elaborate on the visual and narrative elements of the video in detail.
81
+
82
+ Caption: The video shows a person's hands working on a white surface. They are folding a piece of fabric with a checkered pattern in shades of blue and white. The fabric is being folded into a smaller, more compact shape. The person's fingernails are painted red, and they are wearing a black and red garment. There are also a ruler and a pencil on the surface, suggesting that measurements and precision are involved in the process.
83
+
84
+ ### In context learning
85
+
86
+ <img src="demo_images/demo_img_1.png" height="239">
87
+ <img src="demo_images/demo_img_2.png" height="250">
88
+
89
+ ### Multi-image reasoning
90
+
91
+ <img src="demo_images/demo_img_3.png" height="193">
92
+
93
+ ### VILA on Jetson Orin
94
+
95
+ https://github.com/Efficient-Large-Model/VILA/assets/7783214/6079374c-0787-4bc4-b9c6-e1524b4c9dc4
96
+
97
+ ### VILA on RTX 4090
98
+
99
+ https://github.com/Efficient-Large-Model/VILA/assets/7783214/80c47742-e873-4080-ad7d-d17c4700539f
100
+
101
+ </details>
102
+
103
+ ## Installation
104
+
105
+ ```bash
106
+ ./environment_setup.sh vila
107
+ ```
108
+
109
+ ## Training
110
+
111
+ VILA training contains three steps, for specific hyperparameters, please check out the [scripts/v1_5](scripts/v1_5) folder:
112
+
113
+ ### Step-1: Alignment
114
+
115
+ We utilize LLaVA-CC3M-Pretrain-595K dataset to align the textual and visual modalities.
116
+
117
+ The stage 1 script takes in two parameters and it can run on a single 8xA100 node. `BASE_MODEL_PATH` points to a online or local huggingface repository, such as `NousResearch/Llama-2-7b-hf`. `OUTPUT_NAME` points to a target directory under `checkpoints`, which will save the trained multimodal projector afterwards.
118
+
119
+ ```bash
120
+ bash scripts/v1_5/paper/1_mm_align.sh [BASE_MODEL_PATH] [OUTPUT_NAME]
121
+ ```
122
+
123
+ ### Step-2: Pretraining
124
+
125
+ We use MMC4 and Coyo dataset to train VLM with interleaved image-text pairs.
126
+
127
+ ```bash
128
+ bash scripts/v1_5/paper/2_pretrain_mmc4_coyo.sh [CODE_PATH] [BASE_MODEL_PATH] [STAGE1_PATH] [OUTPUT_NAME]
129
+ ```
130
+
131
+ The stage 2 script takes in four arguments. `CODE_PATH` is the absolute path to our VILA codebase, `BASE_MODEL_PATH` has similar meaning to what is presented in the stage 1 script. `STAGE1_PATH` points to the `OUTPUT_NAME` of stage 1 (i.e. where the stage 1 checkpoint is stored). `OUTPUT_NAME` is the desired folder name under `checkpoints` that saves the pretraining checkpoint. The script we provided for this stage is executed on slurm, and we expect it to execute on 16 nodes (128 GPUs).
132
+
133
+ ### Step-3: Supervised fine-tuning
134
+
135
+ This is the last stage of VILA training, in which we tune the model to follow multimodal instructions on a subset of M3IT, FLAN and ShareGPT4V. This stage runs on a 8xA100 node.
136
+
137
+ ```bash
138
+ bash scripts/v1_5/paper/3_sft.sh [STAGE2_PATH] [OUTPUT_NAME]
139
+ ```
140
+
141
+ The stage 3 script takes in two arguments. `STAGE2_PATH` points to the `OUTPUT_NAME` of the stage 2 script (i.e. where the stage 2 checkpoint is stored). `OUTPUT_NAME` is the desired folder name under `checkpoints` that stores the final checkpoint.
142
+
143
+ ## Evaluations
144
+
145
+ ### Image Benchmarks
146
+
147
+ You can follow [Llava1.5 eval](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md) to download all datasets. After downloading all datasets, please put them under `playground/data/eval`.
148
+
149
+ Please make the following changes to the MME evaluation script. Please search for:
150
+
151
+ ```python
152
+ data_path = "MME_Benchmark_release_version"
153
+ ```
154
+
155
+ and replace it with:
156
+
157
+ ```python
158
+ data_path = os.path.join(script_dir, "MME_Benchmark_release_version")
159
+ ```
160
+
161
+ We provide a push-the-button script to perform evaluation on all 10 datasets that do not require GPT-assisted evaluation:
162
+
163
+ ```bash
164
+ ./scripts/v1_5/eval/eval_all.sh [CHECKPOINT_PATH] [MODEL_NAME] [CONV_MODE]
165
+ ```
166
+
167
+ This script takes in two parameters, `CHECKPOINT_PATH` points to the stage 3 model checkpoint, and `MODEL_NAME` will be the name of evaluation results.
168
+
169
+ [VQAv2](https://eval.ai/web/challenges/challenge-page/830/my-submission) and [Vizwiz](https://eval.ai/web/challenges/challenge-page/2185/my-submission) evaluations are hosted on eval.ai. You need to register an account and create a team to be able to submit eval.
170
+
171
+ MMBench and MMBench_CN eval are hosted on another [evaluation server](https://opencompass.org.cn/leaderboard-multimodal). Make sure you change the name of the file before submitting, otherwise the server caches results and will always return wrong result to you.
172
+
173
+ We provide a quick script to automatically organize the prediction files that need to be submitted to servers:
174
+
175
+ ```bash
176
+ python scripts/v1_5/eval/copy_predictions.py [MODEL_NAME]
177
+ ```
178
+
179
+ You will be able to find the predictions under `playground/data/predictions_upload/[MODEL_NAME]` after executing this script.
180
+
181
+ ### Video Benchmarks
182
+
183
+ Please follow the evaluation steps in [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA/blob/main/TRAIN_AND_VALIDATE.md#data-for-validating) for dataset preparation.
184
+
185
+ ```bash
186
+ ./scripts/v1_5/eval/video_chatgpt/run_all.sh [CHECKPOINT_PATH] [MODEL_NAME] [CONV_MODE]
187
+ ./scripts/v1_5/eval/video_chatgpt/eval_all.sh [MODEL_NAME]
188
+ ```
189
+
190
+ ## Inference
191
+
192
+ We provide snippets for quick inference with user prompts and images.
193
+
194
+ Llama-3-VILA1.5-8B inference:
195
+
196
+ ```bash
197
+ python -W ignore llava/eval/run_vila.py \
198
+ --model-path Efficient-Large-Model/Llama-3-VILA1.5-8b-Fix \
199
+ --conv-mode llama_3 \
200
+ --query "<image>\n Please describe the traffic condition." \
201
+ --image-file "av.png"
202
+ ```
203
+
204
+ VILA1.5-40B inference:
205
+
206
+ ```bash
207
+ python -W ignore llava/eval/run_vila.py \
208
+ --model-path Efficient-Large-Model/VILA1.5-40b \
209
+ --conv-mode hermes-2 \
210
+ --query "<image>\n Please describe the traffic condition." \
211
+ --image-file "av.png"
212
+ ```
213
+
214
+ VILA1.5-3B video inference:
215
+
216
+ ```bash
217
+ python -W ignore llava/eval/run_vila.py \
218
+ --model-path Efficient-Large-Model/VILA1.5-3b \
219
+ --conv-mode vicuna_v1 \
220
+ --query "<video>\n Please describe this video." \
221
+ --video-file "demo.mp4"
222
+ ```
223
+
224
+ ## Quantization and Deployment
225
+
226
+ Our VILA models are quantized by [AWQ](https://arxiv.org/abs/2306.00978) into 4 bits for efficient inference on the edge. We provide a push-the-button [script](https://github.com/mit-han-lab/llm-awq/blob/main/scripts/vila_example.sh) to quantize VILA with AWQ.
227
+
228
+ ### Running VILA on desktop GPUs and edge GPUs
229
+
230
+ We support AWQ-quantized 4bit VILA on GPU platforms via [TinyChat](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat). We provide a [tutorial](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat#support-vlm-models-vila--llava) to run the model with TinyChat after quantization. We also provide an [instruction](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat/serve) to launch a Gradio server (powered by TinyChat and AWQ) to serve 4-bit quantized VILA models.
231
+
232
+ ### Running VILA on laptops
233
+
234
+ We further support our AWQ-quantized 4bit VILA models on various CPU platforms with both x86 and ARM architectures with our [TinyChatEngine](https://github.com/mit-han-lab/TinyChatEngine). We also provide a detailed [tutorial](https://github.com/mit-han-lab/TinyChatEngine/tree/main?tab=readme-ov-file#deploy-vision-language-model-vlm-chatbot-with-tinychatengine) to help the users deploy VILA on different CPUs.
235
+
236
+ ### Running VILA API server
237
+
238
+ A simple API server has been provided to serve VILA models. The server is built on top of [FastAPI](https://fastapi.tiangolo.com/) and [Huggingface Transformers](https://huggingface.co/transformers/). The server can be run with the following command:
239
+
240
+ #### With CLI
241
+
242
+ ```bash
243
+ python -W ignore server.py \
244
+ --port 8000 \
245
+ --model-path Efficient-Large-Model/VILA1.5-3B \
246
+ --conv-mode vicuna_v1
247
+ ```
248
+
249
+ #### With Docker
250
+
251
+ ```bash
252
+ docker build -t vila-server:latest .
253
+ docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
254
+ -v ./hub:/root/.cache/huggingface/hub \
255
+ -it --rm -p 8000:8000 \
256
+ -e VILA_MODEL_PATH=Efficient-Large-Model/VILA1.5-3B \
257
+ -e VILA_CONV_MODE=vicuna_v1 \
258
+ vila-server:latest
259
+ ```
260
+
261
+ Then you can call the endpoint with the OpenAI SDK as follows:
262
+
263
+ ```python
264
+ from openai import OpenAI
265
+
266
+ client = OpenAI(
267
+ base_url="http://localhost:8000",
268
+ api_key="fake-key",
269
+ )
270
+ response = client.chat.completions.create(
271
+ messages=[
272
+ {
273
+ "role": "user",
274
+ "content": [
275
+ {"type": "text", "text": "What’s in this image?"},
276
+ {
277
+ "type": "image_url",
278
+ "image_url": {
279
+ "url": "https://blog.logomyway.com/wp-content/uploads/2022/01/NVIDIA-logo.jpg",
280
+ # Or you can pass in a base64 encoded image
281
+ # "url": "data:image/png;base64,<base64_encoded_image>",
282
+ },
283
+ },
284
+ ],
285
+ }
286
+ ],
287
+ max_tokens=300,
288
+ model="VILA1.5-3B",
289
+ # You can pass in extra parameters as follows
290
+ extra_body={"num_beams": 1, "use_cache": False},
291
+ )
292
+ print(response.choices[0].message.content)
293
+ ```
294
+
295
+ <sup>NOTE: This API server is intended for evaluation purposes only and has not been optimized for production use. It has only been tested on A100 and H100 GPUs.</sup>
296
+
297
+ ## Checkpoints
298
+
299
+ We release [VILA1.5-3B](https://hf.co/Efficient-Large-Model/VILA1.5-3b), [VILA1.5-3B-S2](https://hf.co/Efficient-Large-Model/VILA1.5-3b-s2), [Llama-3-VILA1.5-8B](https://hf.co/Efficient-Large-Model/Llama-3-VILA1.5-8B-Fix), [VILA1.5-13B](https://hf.co/Efficient-Large-Model/VILA1.5-13b), [VILA1.5-40B](https://hf.co/Efficient-Large-Model/VILA1.5-40b) and the 4-bit [AWQ](https://arxiv.org/abs/2306.00978)-quantized models [VILA1.5-3B-AWQ](https://hf.co/Efficient-Large-Model/VILA1.5-3b-AWQ), [VILA1.5-3B-S2-AWQ](https://hf.co/Efficient-Large-Model/VILA1.5-3b-s2-AWQ), [Llama-3-VILA1.5-8B-AWQ](https://hf.co/Efficient-Large-Model/Llama-3-VILA1.5-8B-Fix-AWQ), [VILA1.5-13B-AWQ](https://hf.co/Efficient-Large-Model/VILA1.5-13b-AWQ), [VILA1.5-40B-AWQ](https://hf.co/Efficient-Large-Model/VILA1.5-40b-AWQ).
300
+
301
+ ## 🔒 License
302
+
303
+ - The code is released under the Apache 2.0 license as found in the [LICENSE](./LICENSE) file.
304
+ - The pretrained weights are released under the [CC-BY-NC-SA-4.0 license](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en).
305
+ - The service is a research preview intended for non-commercial use only, and is subject to the following licenses and terms:
306
+ - [Model License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA. For LLAMA3-VILA checkpoints terms of use, please refer to the [LLAMA3 License](https://llama.meta.com/llama3/license/) for additional details.
307
+ - [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI
308
+ - [Dataset Licenses](./data_prepare/LICENSE) for each one used during training.
309
+
310
+ ## Team
311
+
312
+ | | | |
313
+ | --- | --- | ---|
314
+ [\*Yao Lu](https://scholar.google.com/citations?user=OI7zFmwAAAAJ&hl=en): Nvidia| [\*Hongxu Yin](https://hongxu-yin.github.io/): Nvidia | [\*Ji Lin](https://www.linji.me/): OpenAI (work done at Nvidia and MIT)
315
+ [Wei Ping](https://scholar.google.com/citations?user=6gKEYRgAAAAJ&hl=en): Nvidia | [Pavlo Molchanov](https://www.pmolchanov.com/): Nvidia | [Andrew Tao](https://scholar.google.com/citations?user=Wel9l1wAAAAJ&hl=en): Nvidia |
316
+ [Haotian Tang](http://kentang.net/): MIT | [Shang Yang](https://ys-2020.github.io/): MIT | [Ligeng Zhu](https://lzhu.me/): Nvidia, MIT |
317
+ [Wei-Chen Wang](https://weichenwang.me/): MIT | [Fuzhao Xue](https://xuefuzhao.github.io/): Nvidia, NUS | [Yunhao Fang](https://seerkfang.github.io/): Nvidia, UCSD |
318
+ [Yukang Chen](https://yukangchen.com/): Nvidia, CUHK | [Zhuoyang Zhang](https://openreview.net/profile?id=~Zhuoyang_Zhang1): Nvidia, Tsinghua Univ. | [Yue Shen](https://www.linkedin.com/in/yue-james-shen/): Nvidia |
319
+ [Wei-Ming Chen](https://scholar.google.com/citations?user=6xFvyJwAAAAJ&hl=en): Nvidia | [Huizi Mao](https://scholar.google.com/citations?user=r5WezOYAAAAJ&hl=zh-CN): Nvidia | [Baifeng Shi](https://bfshi.github.io/): Nvidia, UC Berkeley |
320
+ [Jan Kautz](https://jankautz.com/): Nvidia | [Mohammad Shoeybi](https://scholar.google.com/citations?user=62ElavIAAAAJ&hl=en): Nvidia | [Song Han](http://songhan.mit.edu/): Nvidia, MIT
321
+
322
+ ## Citations
323
+
324
+ ```
325
+ @misc{lin2023vila,
326
+ title={VILA: On Pre-training for Visual Language Models},
327
+ author={Ji Lin and Hongxu Yin and Wei Ping and Yao Lu and Pavlo Molchanov and Andrew Tao and Huizi Mao and Jan Kautz and Mohammad Shoeybi and Song Han},
328
+ year={2023},
329
+ eprint={2312.07533},
330
+ archivePrefix={arXiv},
331
+ primaryClass={cs.CV}
332
+ }
333
+ ```
334
+
335
+ # Acknowledgement
336
+
337
+ - [LLaVA](https://github.com/haotian-liu/LLaVA): the codebase we built upon. Thanks for their wonderful work.
338
+ - [InternVL](https://github.com/OpenGVLab/InternVL): for open-sourcing InternViT (used in VILA1.5-40b) and the [InternVL-SFT](https://github.com/OpenGVLab/InternVL/tree/main/internvl_chat#prepare-training-datasets) data blend (inspired by LLaVA-1.6) used in all VILA1.5 models.
339
+ - [Vicuna](https://github.com/lm-sys/FastChat): the amazing open-sourced large language model!
340
+ - [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT): we borrowed video evaluation script from this repository.
341
+ - [MMC4](https://github.com/allenai/mmc4), [COYO-700M](https://github.com/kakaobrain/coyo-dataset), [M3IT](https://huggingface.co/datasets/MMInstruction/M3IT), [OpenORCA/FLAN](https://huggingface.co/datasets/Open-Orca/FLAN), [ShareGPT4V](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V), [WIT](google-research-datasets/wit), [GSM8K-ScRel](https://github.com/OFA-Sys/gsm8k-ScRel/blob/main/data/train_use.jsonl), [VisualGenome](https://visualgenome.org/api/v0/api_home.html), [VCR](https://visualcommonsense.com/download/), [ScienceQA](https://huggingface.co/datasets/derek-thomas/ScienceQA), [Shot2Story](https://github.com/bytedance/Shot2Story/blob/master/DATA.md), [Youcook2](http://youcook2.eecs.umich.edu/), [Vatex](https://eric-xw.github.io/vatex-website/download.html), [ShareGPT-Video](https://huggingface.co/datasets/ShareGPTVideo/train_video_and_instruction) for providing datasets used in this research.
VILA/.ipynb_checkpoints/environment_setup-checkpoint.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ # This is required to activate conda environment
4
+ eval "$(conda shell.bash hook)"
5
+
6
+ # CONDA_ENV=${1:-""}
7
+ CONDA_ENV=vila
8
+ if [ -n "$CONDA_ENV" ]; then
9
+ conda create -n $CONDA_ENV python=3.10 -y
10
+ conda activate $CONDA_ENV
11
+ else
12
+ echo "Skipping conda environment creation. Make sure you have the correct environment activated."
13
+ fi
14
+
15
+ # This is required to enable PEP 660 support
16
+ pip install --upgrade pip
17
+
18
+ # This is optional if you prefer to use built-in nvcc
19
+ conda install -c nvidia cuda-toolkit -y
20
+
21
+ # Install FlashAttention2
22
+ pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
23
+
24
+ # Install VILA
25
+ pip install -e .
26
+ pip install -e ".[train]"
27
+ pip install -e ".[eval]"
28
+
29
+ # Install HF's Transformers
30
+ pip install git+https://github.com/huggingface/transformers@v4.37.2
31
+ site_pkg_path=$(python -c 'import site; print(site.getsitepackages()[0])')
32
+ cp -rv ./llava/train/transformers_replace/* $site_pkg_path/transformers/
33
+ cp -rv ./llava/train/deepspeed_replace/* $site_pkg_path/deepspeed/
VILA/CIs/license_all.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ addlicense -s -c 'NVIDIA CORPORATION & AFFILIATES' -ignore "llava/eval/**" -ignore "**/*__init__.py" **/*.py
VILA/CIs/license_commited.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ PYFILES=$(git diff --name-only --diff-filter=ACMRT $commithash HEAD | grep .py | xargs)
2
+
3
+ for file in $PYFILES; do
4
+ echo $file
5
+ addlicense -s -c 'NVIDIA CORPORATION & AFFILIATES' $file
6
+ done
VILA/data_prepare/.DS_Store ADDED
Binary file (6.15 kB). View file
 
VILA/data_prepare/LICENSE ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ License information for datasets used during VILA Training
2
+
3
+ * LLaVA-1.5 Instruction Data: Apache 2.0
4
+ * Coyo: cc-by-4.0
5
+ * MMC4: ODC-By
6
+ * FLAN: cc-by-4.0
7
+ * M3IT: cc-by-4.0
8
+ * ShareGPT4V: cc-by-nc-4.0
VILA/data_prepare/README.md ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data Preparation for Training VILA
2
+
3
+ To train VILA, we used the following datasets:
4
+
5
+ | Stage | Datasets |
6
+ | ----------------------- | -------------------------------------------------------------------------------- |
7
+ | 1. Initialize projector | CC3M |
8
+ | 2. Pre-training | MMC4-core, COYO-700M subset |
9
+ | 3. SFT | LLaVA-1.5, VFLAN, ShareGPT, TextFLAN, WIT, GSM8K-ScRel-SFT, Sherlock, ScienceQA |
10
+
11
+ ### LLaVa-CC3M-Pretrain
12
+
13
+ We use [LLaVA-CC3M-Pretrain-595K](https://huggingface.co/datasets/liuhaotian/LLaVA-CC3M-Pretrain-595K/blob/main/chat.json) to train the visual language projector
14
+
15
+ ### MMC4-Core Dataset
16
+
17
+ Due to the limit of compute, we pre-train VILA on the smaller core set of MMC4 instead of the full set.
18
+
19
+ 1. Firstly, download the annotations of the MMC4-core dataset here: https://github.com/allenai/mmc4. We used the non-fewer-face split, and you may need to request the access [here](https://forms.gle/VYtcNY8aYaUANK9f8).
20
+
21
+ 1. Now modify the input and output path in `mmc4_downloader.py` and run the following script to scrawl the MMC4 images:
22
+
23
+ ```bash
24
+ cd mmc4
25
+ python mmc4_downloader.py
26
+ ```
27
+
28
+ Note that due to the expiration of image urls, you may end up getting a subset of the entire corpus.
29
+
30
+ The scrawling may take a long time. Optionally, you can also shard the workload over multiple jobs/machines concurrently to speed up the process:
31
+
32
+ ```bash
33
+ # provide the start and end index of the jsonl shard. There are 23098 - 14 shards totally
34
+ # python mmc4_downloader.py <start_idx> <end_idx>
35
+ python mmc4_downloader.py 0 1000 # worker 1
36
+ python mmc4_downloader.py 1000 2000 # worker 2
37
+ ```
38
+
39
+ 3. Filter out invalid samples in MMC4:
40
+
41
+ ```bash
42
+ python mmc4_filter_and_counter.py
43
+ ```
44
+
45
+ 4. Merge images and text into a unified pickle file for each shard:
46
+
47
+ ```bash
48
+ python mmc4_merger.py
49
+ ```
50
+
51
+ ### COYO-700M Dataset
52
+
53
+ 1. Download the metadata of COYO-700M:
54
+
55
+ ```bash
56
+ huggingface-cli download kakaobrain/coyo-700m --repo-type dataset --local-dir coyo-700m --local-dir-use-symlinks False
57
+ ```
58
+
59
+ 2. Crawl the COYO images. Note that here we only keep a 20% subset in each shard with the highest CLIP similarity, to balance compute budget and data quality.
60
+
61
+ There are totally 128 shards of annotations. Now download each one with the script:
62
+
63
+ ```bash
64
+ cd coyo
65
+ for SHARD in {0..127}; do
66
+ python coyo_downloader.py $SHARD
67
+ done
68
+ ```
69
+
70
+ 3. Split downloaded COYO data into multiple shards:
71
+
72
+ ```bash
73
+ python coyo_splitter.py
74
+ ```
75
+
76
+ ### LLaVA-1.5 Instruction Data
77
+
78
+ We use this [file](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json) in our experiments. Please download this dataset from LLaVA authors.
79
+
80
+ ```bash
81
+ huggingface-cli download liuhaotian/LLaVA-Instruct-150K llava_v1_5_mix665k.json --repo-type dataset
82
+ ```
83
+
84
+ ### VFlan dataset
85
+
86
+ 1. Download FLAN datasets:
87
+
88
+ ```bash
89
+ huggingface-cli download Open-Orca/FLAN --repo-type dataset --local-dir FLAN --local-dir-use-symlinks False
90
+ ```
91
+
92
+ 2. Preprocess FLAN dataset (sample 1M data from 378M samples):
93
+
94
+ ```bash
95
+ cd sft
96
+ python preprocess_flan.py
97
+ ```
98
+
99
+ ### M3IT Dataset
100
+
101
+ 1. Download M3IT datasets:
102
+
103
+ ```bash
104
+ huggingface-cli download MMInstruction/M3IT --repo-type dataset --local-dir M3IT --local-dir-use-symlinks False
105
+ ```
106
+
107
+ 2. Preprocess M3IT dataset:
108
+
109
+ ```bash
110
+ python preprocess_m3it.py
111
+ ```
112
+
113
+ 3. (Optional) Split FLAN+M3IT into multiple chunks to reduce CPU memory pressure during training:
114
+
115
+ ```bash
116
+ python split_vflan.py
117
+ ```
118
+
119
+ ### ShareGPT4v
120
+
121
+ The ShareGPT data can be obtained [mit-han-lab/ShareGPT4V](https://huggingface.co/datasets/mit-han-lab/ShareGPT4V). * Note the original ShareGPT4v dataset contains some samples with file ids (sa_XXXX) and repetitive responses. We filter those bad examples and reduced the samples from 100K -> 96K (for caption) and 1.2m -> 1.17m (for pretraining). Then we re-combine them into a single file.
122
+
123
+ ```bash
124
+ huggingface-cli download mit-han-lab/ShareGPT4V --repo-type dataset --local-dir ShareGPT4V --local-dir-use-symlinks False
125
+ ```
126
+
127
+ ### WIT
128
+
129
+ The original WIT data can be obtained [google-research-datasets/wit](https://github.com/google-research-datasets/wit/tree/main). * We subsample ~538K english data from the original WIT dataset and curate a llava conversation format JSON file.
130
+
131
+ ```bash
132
+ huggingface-cli download Efficient-Large-Model/WIT_538K --repo-type dataset --local-dir WIT --local-dir-use-symlinks False
133
+ ```
134
+
135
+ ### GSM8K-ScRel-SFT
136
+
137
+ We add some math data [gsm8k-ScRel](https://github.com/OFA-Sys/gsm8k-ScRel/blob/main/data/train_use.jsonl) to our SFT stage.
138
+
139
+ ### Sherlock
140
+
141
+ The image files of Sherlock can be obtained from [VisualGenome](https://visualgenome.org/api/v0/api_home.html) and [VCR](https://visualcommonsense.com/download/) separately. The llava conversation format JSON file can be downloaded with
142
+
143
+ ```bash
144
+ huggingface-cli download Efficient-Large-Model/sherlock_317K --repo-type dataset --local-dir sherlock --local-dir-use-symlinks False
145
+ ```
146
+
147
+ ### ScienceQA
148
+
149
+ We use the train split of ScienceQA. The image data of the train split can be obtained from [ScienceQA](https://huggingface.co/datasets/derek-thomas/ScienceQA) or their [huggingface repo](https://huggingface.co/datasets/derek-thomas/ScienceQA). The llava conversation format JSON file can be downloaded with
150
+
151
+ ```bash
152
+ huggingface-cli download Efficient-Large-Model/ScienceQA_train_12K --repo-type dataset --local-dir scienceqa --local-dir-use-symlinks False
153
+ ```
154
+
155
+ ### IDEFICS2-SFT dataset
156
+
157
+ We also provide scripts to preprocess IDEFICS2-SFT dataset into llava-SFT like format.
158
+
159
+ Please first download [HuggingFaceM4/the_cauldron](https://huggingface.co/datasets/HuggingFaceM4/the_cauldron) to `/home/jasonlu/workspace/idefics2-sft/the_cauldron`. Then, run the following scripts:
160
+
161
+ ```bash
162
+ python preprocess_idefics2.py
163
+ python merge_idefics2.py
164
+ ```
165
+
166
+ A sample in the preprocessed dataset file will look like this:
167
+
168
+ ```json
169
+ {"id": 0, "images": ["images/chart2text/0_0.png"], "conversations": [{"from": "human", "value": "<image>\nPlease clarify the meaning conveyed by this graph."}, {"from": "gpt", "value": "This statistic presents the reach of the most popular social networks among female beauty consumers in the United States as of August 2016. During the survey period, 62 percent of respondents had an Instagram account."}]}
170
+ ```
171
+
172
+ Haotian's Note: Datasets overlapping with VFLAN / ShareGPT4V-SFT are removed. I also remove `plotqa` since it is too large, `localized_narratives` seems to be a little bit overlapped with captioning efforts within VILA. `websight` and `datikz` are two datasets that target code generation. Since the output is very long, and including them might slow down training, I also temporarily removed these two datasets, but feel free to add them back.
VILA/data_prepare/panda70m.sh ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Launch parallel slurm jobs that clean up / process panda70m shards.
#
# Usage: bash data_prepare/panda70m.sh [jobs_limit] [workdir]

JOBS_LIMIT=${1:-32}  # max number of concurrently backgrounded srun jobs
workdir=${2:-/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/panda70m/panda70m_training_10m}

# Last path component of the workdir, used to label the slurm jobs.
wname=$(echo $workdir | rev | cut -d "/" -f 1 | rev)

echo "Parallely checking for all shards in $workdir / $wname"
parallel_size=32
idx_size=$(( parallel_size - 1 ))

mkdir -p slurm-logs/data

for idx in $(seq 0 $idx_size); do
    # Throttle: do not keep more than JOBS_LIMIT background jobs alive at once.
    while [ $(jobs -rp | wc -l) -ge $JOBS_LIMIT ]; do
        sleep 1
    done
    echo "Running jobs $(jobs -rp | wc -l) $wname-$idx-of-$parallel_size";

    srun -A llmservice_nlp_fm \
        -p cpu,cpu_1,cpu_long -t 4:00:00 -J cleanup-$wname-$idx-of-$parallel_size \
        --cpus-per-task 8 \
        --mem-per-cpu 8G \
        -e slurm-logs/data/$idx-of-$parallel_size.err \
        -o slurm-logs/data/$idx-of-$parallel_size.txt \
        python llava/data/dataset_impl/panda70m.py --workdir=$workdir --shards=$idx --total=$parallel_size &

done

# Fix: block until all backgrounded srun jobs finish. Previously the script
# returned while shard jobs were still running (parallel_shards.sh already
# ends with `wait`; this makes the two scripts consistent).
wait

# bash data_prepare/panda70m.sh 32 /lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/panda70m/panda70m_training_10m;
# bash data_prepare/panda70m.sh 32 /lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/panda70m/panda70m_training_2m;
# bash data_prepare/panda70m.sh 32 /lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/panda70m/panda70m_testing;

# --exclusive \
# --cpus-per-task 8 \
# --mem-per-cpu 8G \
VILA/data_prepare/panda_split.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import base64
18
+ import copy
19
+ import glob
20
+ import io
21
+ import json
22
+ import logging
23
+ import os
24
+ import os.path as osp
25
+ import pathlib
26
+ import pickle
27
+ import random
28
+ import re
29
+ import shutil
30
+ import time
31
+ from collections import defaultdict
32
+ from dataclasses import dataclass, field
33
+ from datetime import datetime
34
+ from functools import lru_cache
35
+ from io import BytesIO
36
+ from typing import Dict, List, Optional, Sequence
37
+
38
+ import cv2
39
+ import decord
40
+ import numpy as np
41
+ import PIL
42
+ import torch
43
+ import transformers
44
+ from decord._ffi.base import DECORDError
45
+ from iopath.common.file_io import g_pathmgr
46
+ from PIL import Image
47
+ from pytorchvideo.data.decoder import DecoderType
48
+ from pytorchvideo.data.encoded_video import EncodedVideo, select_video_class
49
+ from pytorchvideo.data.video import Video
50
+ from torch.utils.data import ConcatDataset, Dataset
51
+ from torchvision.transforms import Resize
52
+
53
+ import llava.data.datasets_mixture as datasets_mixture
54
+ from llava import conversation as conversation_lib
55
+ from llava.constants import (
56
+ DEFAULT_IM_END_TOKEN,
57
+ DEFAULT_IM_START_TOKEN,
58
+ DEFAULT_IMAGE_TOKEN,
59
+ IGNORE_INDEX,
60
+ IMAGE_TOKEN_INDEX,
61
+ )
62
+ from llava.data.dataset import LazySupervisedDataset
63
+ from llava.data.dataset_impl.textocr import GenericDataset, preprocess_OCR
64
+ from llava.data.datasets_mixture import DATASETS
65
+ from llava.data.simple_vila_webdataset import VILAWebDataset
66
+ from llava.data.utils import VILAEncodedVideo
67
+ from llava.mm_utils import is_gemma_tokenizer, tokenizer_image_token
68
+ from llava.train.args import DataArguments, TrainingArguments
69
+
70
# Root directory that holds the panda70m dataset splits on the cluster.
DEFAULT_HIERTEXT = "/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/panda70m"
# Dataset split processed by default.
SPLIT = "panda70m_testing"
72
+
73
+
74
def with_opencv(filename):
    """Probe a video file's basic timing metadata with OpenCV.

    Args:
        filename: Path to a video file readable by ``cv2.VideoCapture``.

    Returns:
        Tuple of ``(duration_seconds, fps, frame_count)``.
    """
    video = cv2.VideoCapture(filename)
    try:
        fps = video.get(cv2.CAP_PROP_FPS)
        frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        # NOTE(review): fps can be 0.0 for unreadable/corrupted files, which
        # raises ZeroDivisionError here — same fail-fast behavior as before.
        duration = frame_count / fps
    finally:
        # Fix: release the capture handle; the original leaked it on every call.
        video.release()
    return duration, fps, frame_count
80
+
81
+
82
def split_video_to_clips(
    workdir=osp.expanduser("~/nvr_elm_llm/dataset/panda70m/panda70m_training_2m"),
    shards=0,
    total=-1,
):
    """Iterate over the ``.mp4`` files of one shard of a panda70m folder.

    Args:
        workdir: Folder containing paired ``<name>.mp4`` / ``<name>.json`` files.
        shards: Index of the shard this worker processes.
        total: Total number of shards; ``<= 0`` means process the full list.
    """
    video_list = glob.glob(f"{workdir}/*.mp4")
    video_list = sorted(video_list)
    if total > 0:
        # Evenly partition the sorted list; the last shard absorbs the remainder.
        chunk = len(video_list) // total
        begin_idx = shards * chunk
        end_idx = (shards + 1) * chunk
        if shards == total - 1:
            end_idx = len(video_list)
        video_list = video_list[begin_idx:end_idx]
    print(f"Splitting total {len(video_list)} videos")
    output_dir = workdir + "_clip"  # NOTE(review): computed but not used yet
    debug_info = {}
    for idx, video_path in enumerate(video_list):
        print(f"[{idx}/{len(video_list)}]", video_path)
        json_path = video_path.replace(".mp4", ".json")
        assert osp.exists(json_path) and osp.exists(video_path)
        # Fix: close the annotation file deterministically; the original
        # `json.load(open(json_path))` leaked the file handle.
        with open(json_path) as f:
            jinfo = json.load(f)
        print(jinfo)
        info = with_opencv(video_path)
        print(info)
        video = VILAEncodedVideo.from_bytesio(video_path, decoder="decord", decode_audio=False)

        # NOTE(review): unconditional return after the first video — this looks
        # like a debug stub left in place; confirm before relying on full runs.
        return
110
+
111
+
112
if __name__ == "__main__":
    # Expose split_video_to_clips as a CLI, e.g.:
    #   python data_prepare/panda_split.py --workdir=... --shards=0 --total=32
    # WORKDIR=osp.expanduser("~/nvr_elm_llm/dataset/panda70m/panda70m_testing")
    # cleanup_corrupted_videos()
    import fire

    fire.Fire(split_video_to_clips)
VILA/data_prepare/parallel_shards.sh ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Shard-parallel creation of a VILA webdataset index via slurm.
#
# Usage: bash data_prepare/parallel_shards.sh [jobs_limit] [workdir]

JOBS_LIMIT=${1:-32}  # max number of concurrently backgrounded srun jobs

# Fix: the original unconditionally overwrote $2 with this hard-coded path,
# making the second CLI argument dead. Keep the same path, but only as the
# default so callers can still override the workdir.
workdir=${2:-/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/video_datasets_v2/internvid/video_data_tar}

parallel_size=32
idx_size=$(( parallel_size - 1 ))

mkdir -p slurm-logs/data

for idx in $(seq 0 $idx_size); do
    # Throttle: keep at most JOBS_LIMIT jobs in flight.
    while [ $(jobs -rp | wc -l) -ge $JOBS_LIMIT ]; do
        sleep 1
    done
    echo "Running jobs $(jobs -rp | wc -l) $idx-of-$parallel_size";

    srun -A $SLURM_ACCOUNT \
        -p cpu,cpu_1,cpu_long -t 4:00:00 -J creating-WDS-$idx-of-$parallel_size \
        --cpus-per-task 8 \
        --mem-per-cpu 8G \
        --dependency singleton \
        -e slurm-logs/data/$idx-of-$parallel_size.err \
        -o slurm-logs/data/$idx-of-$parallel_size.txt \
        python llava/data/simple_vila_webdataset.py $workdir --shards=$idx --total=$parallel_size &
done
wait

# Final pass over the full dataset once all shard jobs have finished.
python llava/data/simple_vila_webdataset.py $workdir
VILA/demo_images/LongVILA-pipeline.png ADDED

Git LFS Details

  • SHA256: d29fdbb1cdf908a8053cf9ca19262aaf4823d51cd2c04567f8375af951f6cdd8
  • Pointer size: 131 Bytes
  • Size of remote file: 156 kB
VILA/demo_images/av.png ADDED

Git LFS Details

  • SHA256: 093f0838b946c86d932ca76ad5b0fc871609d1c49dba359a9380545d31b67ed3
  • Pointer size: 131 Bytes
  • Size of remote file: 384 kB
VILA/demo_images/demo_img_1.png ADDED

Git LFS Details

  • SHA256: 85765d45ea665ac4afbafbc5ce03fdcc23fd958d64b6da2038a1f6cce85a1541
  • Pointer size: 131 Bytes
  • Size of remote file: 142 kB
VILA/demo_images/demo_img_2.png ADDED

Git LFS Details

  • SHA256: 81b278a341259c01bc01b55effd6f61b6a2b12657305d644473a8ba5371861b9
  • Pointer size: 131 Bytes
  • Size of remote file: 715 kB
VILA/demo_images/demo_img_3.png ADDED

Git LFS Details

  • SHA256: 1e26e812858c4610bfebc33a5f42751db4b88cf948adab19a67e67d4865d1271
  • Pointer size: 131 Bytes
  • Size of remote file: 568 kB
VILA/demo_images/longvila-logo.png ADDED

Git LFS Details

  • SHA256: 41046d75a3bb9d3dde39781e0d204a4f9c58e5353feff6e712590bb8d1fb000d
  • Pointer size: 131 Bytes
  • Size of remote file: 157 kB
VILA/demo_images/vila-logo.jpg ADDED
VILA/demo_trt_llm/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ## Deprecation Notice
2
+
3
+ This README is deprecated and is no longer being maintained. For the most up-to-date information and instructions, please refer to the [TensorRT-LLM example](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal#llava-and-vila) for VILA deployment.
VILA/inference_test/inference_test.json ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_cases": [
3
+ {
4
+ "name": "top down view",
5
+ "image_paths": [
6
+ "more_samples/top_view.png"
7
+ ],
8
+ "QAs": [
9
+ {
10
+ "question": "<image>\n What is unusual about this image?",
11
+ "expected_answer": "The unusual aspect of this image is that it is an aerial view of a busy freeway with many cars, and it appears to be taken from a helicopter. This perspective provides a unique and interesting perspective of the traffic, as it allows the viewer to see the entire freeway and all the cars on it from above. The image captures the bustling nature of the city and the movement of the vehicles, which is not easily visible from ground level."
12
+ }
13
+ ]
14
+ },
15
+
16
+ {
17
+ "name": "deer crossing",
18
+ "image_paths": [
19
+ "more_samples/deer_crossing.png"
20
+ ],
21
+ "QAs": [
22
+ {
23
+ "question": "<image>\n What is unusual about this image?",
24
+ "expected_answer": "The unusual aspect of this image is that a group of deer is crossing a road in front of a car. Typically, deer are not expected to be seen crossing roads, especially in urban or suburban areas. This situation can pose a risk to both the deer and the people in the car, as the deer might not be aware of the approaching vehicle, and the driver may not have enough time to react and stop safely. It is important for drivers to be cautious and patient in such situations to avoid accidents and ensure the safety of both the animals and the people involved."
25
+ }
26
+ ]
27
+ },
28
+
29
+ {
30
+ "name": "windmine",
31
+ "image_paths": [
32
+ "more_samples/windmine.png"
33
+ ],
34
+ "QAs": [
35
+ {
36
+ "question": "<image>\n What is in this image?",
37
+ "expected_answer": "The image features a large field with numerous wind turbines scattered throughout the area. The wind turbines are of varying heights and are positioned at different angles, creating a visually striking scene. The field is surrounded by a forest, adding to the natural beauty of the landscape."
38
+ },
39
+ {
40
+ "question": "<image>\n Do facilities here look normal or need maintenance?",
41
+ "expected_answer": "The facilities in the image, which are wind turbines, appear to be in good condition and functioning properly. They are situated in a large field with a clear blue sky, indicating that they are well-maintained and not in need of immediate maintenance. The presence of multiple wind turbines in the field suggests that they are part of a larger wind farm, which is likely to be managed and maintained by professionals to ensure optimal performance and efficiency."
42
+ }
43
+ ]
44
+ },
45
+
46
+ {
47
+ "name": "windworker",
48
+ "image_paths": [
49
+ "more_samples/windworker.png"
50
+ ],
51
+ "QAs": [
52
+ {
53
+ "question": "<image>\n Can you describe the situation in this image?",
54
+ "expected_answer": "In the image, a person is climbing a large wind turbine, likely a windmill, to perform maintenance or repair work. The individual is using a harness and a rope to secure themselves to the turbine, ensuring their safety while they work. The scene captures the height and scale of the wind turbine, emphasizing the importance of the task being performed by the person."
55
+ },
56
+ {
57
+ "question": "<image>\n Is the worker well protected?",
58
+ "expected_answer": "Yes, the worker is well protected. In the image, the man is wearing a harness while climbing the side of a wind turbine. This harness provides safety and support, preventing the worker from falling and ensuring that they can safely perform their task. Additionally, the worker is likely using other safety equipment and following proper safety protocols to minimize the risk of accidents while working at such heights."
59
+ }
60
+ ]
61
+ },
62
+
63
+ {
64
+ "name": "construction crack",
65
+ "image_paths": [
66
+ "more_samples/construction_crack.webp"
67
+ ],
68
+ "QAs": [
69
+ {
70
+ "question": "<image>\n Is the building in health condition?",
71
+ "expected_answer": "The building appears to be in a state of disrepair, with visible cracks and chips in the paint. This suggests that the building may be old, poorly maintained, or has experienced some form of damage or wear over time. The presence of these structural issues could potentially pose safety risks or negatively impact the building's overall appearance and value."
72
+ },
73
+ {
74
+ "question": "<image>\n What are the likely service needed for this building?",
75
+ "expected_answer": "The building in the image has a cracked wall and a damaged roof, which indicates that it requires repair and maintenance services. The cracked wall may require repairs to ensure structural integrity and prevent further damage. The damaged roof may need to be replaced or repaired to prevent water leakage and potential damage to the building's interior. Additionally, the building may need to be inspected for any other potential issues that may arise due to the damage."
76
+ }
77
+ ]
78
+ },
79
+
80
+ {
81
+ "name": "climate change",
82
+ "image_paths": [
83
+ "more_samples/climate_change.jpeg"
84
+ ],
85
+ "QAs": [
86
+ {
87
+ "question": "<image>\n What is the implication of temperature based on this image?",
88
+ "expected_answer": "The image shows a comparison of the Arctic ice cover in 1984 and 2012. The implication of this image is that the Arctic ice cover has significantly decreased over the years, which is a result of climate change. The reduction in ice cover can have various implications, such as increased exposure of the Arctic environment to warmer temperatures, which can lead to the melting of sea ice, loss of habitat for polar bears and other Arctic species, and potential disruptions to the global climate system. This image serves as a visual representation of the impact of climate change on the Arctic environment and the need for action to mitigate its effects."
89
+ }
90
+ ]
91
+ },
92
+
93
+ {
94
+ "name": "patient status",
95
+ "image_paths": [
96
+ "more_samples/patient_status.jpeg"
97
+ ],
98
+ "QAs": [
99
+ {
100
+ "question": "<image>\n What is the patient status?",
101
+ "expected_answer": "The patient is in a critical condition, as indicated by the presence of a ventilator and the fact that she is hooked up to a lot of wires."
102
+ },
103
+ {
104
+ "question": "<image>\n Is the patient asleep?",
105
+ "expected_answer": "Yes, the patient is asleep in the hospital bed."
106
+ }
107
+ ]
108
+ },
109
+
110
+ {
111
+ "name": "patient care",
112
+ "image_paths": [
113
+ "more_samples/patient_care.png"
114
+ ],
115
+ "QAs": [
116
+ {
117
+ "question": "<image>\n What is the training session about?",
118
+ "expected_answer": "The training session is about learning how to perform CPR (cardiopulmonary resuscitation) on a mannequin. The group of people, including nurses and possibly other medical professionals, are gathered around a mannequin to practice their skills and learn the proper techniques for administering CPR."
119
+ },
120
+ {
121
+ "question": "<image>\n Will the practice cause pain to the patient in the image?",
122
+ "expected_answer": "The image shows a group of people, including nurses and a mannequin, practicing CPR on a mannequin. While practicing CPR is essential for healthcare professionals, it is important to note that it is a simulation and not a real patient. The practice does not cause pain to the patient in the image, as it is a mannequin and not a living person."
123
+ }
124
+ ]
125
+ },
126
+
127
+ {
128
+ "name": "Fuyu comparison",
129
+ "image_paths": [
130
+ "fuyu-capm.png"
131
+ ],
132
+ "QAs": [
133
+ {
134
+ "question": "<image>\n Janet Ludlow’s firm requires all its analysts to use a two-stage dividend discount model (DDM) and the capital asset pricing model (CAPM) to value stocks. Using the CAPM and DDM, Ludlow has valued QuickBrush Company at $63 per share. She now must value SmileWhite Corporation. Calculate the required rate of return for SmileWhite by using the information in the table. Which rate is correct? A. 14% B. 15% C. 16%",
135
+ "expected_answer": "The required rate of return for SmileWhite can be calculated using the capital asset pricing model (CAPM), which is:\n\nRequired Return = Risk-free Rate + Beta x (Expected Market Return - Risk-free Rate)\n\nThe risk-free rate and expected market return are provided in the table, and the beta for SmileWhite is given as 1.15.\n\nLet’s calculate the required rate of return for SmileWhite:\n\nRequired Return = 4.50% + 1.15 x (14.50% - 4.50%)\nRequired Return = 4.50% + 1.15 x 10%\nRequired Return = 4.50% + 11.50%\nRequired Return = 16%\n\nTherefore, the answer is:\n\nC: 16%\nAnswer: C"
136
+ }
137
+ ]
138
+ },
139
+
140
+
141
+
142
+ {
143
+ "name": "Fuyu comparison",
144
+ "image_paths": [
145
+ "fuyu-heavy-table.png"
146
+ ],
147
+ "QAs": [
148
+ {
149
+ "question": "According to the following table, <image>\nwhich food is the most likely cause of the outbreak of food poisoning? (A) Cold chicken, (B) Potato salad, (C) Egg sandwiches, (D) Fruit pie and cream. Read the table in details and consider the percentage as the key factor. Think step-by-step.",
150
+ "expected_answer": "The table shows the number of people who ate different foods and the number of people who ate those foods and got sick. To determine which food is most likely the cause of the outbreak of food poisoning, we’re looking for the food with the highest percentage of people who ate that food and got sick.\n\nLet’s calculate the percentage of people who ate each food and got sick:\n\n- Cold chicken: 34 / 86 = 0.395, which is 39.5%\n- Potato salad: 38 / 54 = 0.704, which is 70.4%\n- Egg sandwiches: 40 / 76 = 0.526, which is 52.6%\n- Fruit pie and cream: 12 / 32 = 0.375, which is 37.5%\n- Cheese: 12 / 48 = 0.25, which is 25%\n\nThe food with the highest percentage of people who ate that food and got sick is potato salad, with 70.4%. Therefore, potato salad is the most likely cause of the outbreak of food poisoning.\n\nAnswer: B"
151
+ }
152
+ ]
153
+ },
154
+
155
+
156
+ {
157
+ "name": "flamingo comparison",
158
+ "image_paths": [
159
+ "flamingo_1.png",
160
+ "flamingo_2.png",
161
+ "flamingo_3.png"
162
+ ],
163
+ "QAs": [
164
+ {
165
+ "question": "Image 1: <image>\n Image 2: <image>\n Image 3: <image>\n What is the common thing about these three images?",
166
+ "expected_answer": "The common theme among these three images is the presence of a flamingo. In the first image, the flamingo is a 3D model, while in the second image, it is a pink cartoon. In the third image, it is a real flamingo standing in the water. All three images showcase the bird in different styles and formats, but they all share the same subject."
167
+ }
168
+ ]
169
+ },
170
+ {
171
+ "name": "princess comparison",
172
+ "image_paths": [
173
+ "princess_1.png",
174
+ "princess_2.png"
175
+ ],
176
+ "QAs": [
177
+ {
178
+ "question": "Image 1: <image>\n Image 2: <image>\n Tell me about the differences between image 1 and image 2.",
179
+ "expected_answer": "The two images feature a cartoon girl wearing a pink dress. In the first image, the girl is wearing a crown, while in the second image, she is wearing a tiara. Both images showcase the girl in a princess-like attire, but the difference lies in the type of headwear she is wearing. The crown is a more traditional and regal accessory, while the tiara is a more playful and common accessory associated with princesses in popular culture."
180
+ }
181
+ ]
182
+ },
183
+ {
184
+ "name": "order price calculation",
185
+ "image_paths": [
186
+ "menu_1.png",
187
+ "menu_2.png"
188
+ ],
189
+ "QAs": [
190
+ {
191
+ "question": "Photo: <image>\n Menu: <image>\n How much should I pay for all the beers on the table according to the price on the menu? Think step by step.",
192
+ "expected_answer": "The beer on the table is Magna, which is priced at $6.00. There are two bottles of Magna on the table. Therefore, the total cost of the beer should be $6.00 x 2 = $12.00."
193
+ }
194
+ ]
195
+ },
196
+ {
197
+ "name": "pizza price calculation",
198
+ "image_paths": [
199
+ "pizza.png"
200
+ ],
201
+ "QAs": [
202
+ {
203
+ "question": "<image>\nI am getting just two custom pizza for me and my friend. How much should I pay in total according to the image? Let's think step by step.",
204
+ "expected_answer": "The image shows a chalkboard menu with a special for today, which is to create your own pizza. The price for this special is $9.99. Since you are getting two custom pizzas, you should pay $9.99 x 2 = $19.98 for the two pizzas. So, the answer is $19.98."
205
+ }
206
+ ]
207
+ },
208
+ {
209
+ "name": "sign OCR",
210
+ "image_paths": [
211
+ "sign_1.png",
212
+ "sign_2.png",
213
+ "sign_3.png"
214
+ ],
215
+ "QAs": [
216
+ {
217
+ "question": "Image 1: <image>\nImage 2: <image>\nImage 3: <image>. Image 1 is Underground. Image 2 is Congress. What is Image 3?",
218
+ "expected_answer": "Soulomes."
219
+ }
220
+ ]
221
+ },
222
+ {
223
+ "name": "painting style",
224
+ "image_paths": [
225
+ "painting_1.png",
226
+ "painting_2.png",
227
+ "painting_3.png"
228
+ ],
229
+ "QAs": [
230
+ {
231
+ "question": "Image 1: <image>\nImage 2: <image>\nImage 3: <image>. Image 1 is Romanticism. Image 2 is Surrealism. What is Image 3?",
232
+ "expected_answer": "Impressionism"
233
+ }
234
+ ]
235
+ },
236
+ {
237
+ "name": "handwritten calculation",
238
+ "image_paths": [
239
+ "handwritten_1.png",
240
+ "handwritten_2.png",
241
+ "handwritten_3.png"
242
+ ],
243
+ "QAs": [
244
+ {
245
+ "question": "Image 1: <image>\nImage 2: <image>\nImage 3: <image>. Image 1 is 2+1=3. Image 2 is 5+6=11. What is Image 3?",
246
+ "expected_answer": "3x6=18"
247
+ }
248
+ ]
249
+ },
250
+ {
251
+ "name": "landmark Taipei",
252
+ "image_paths": [
253
+ "landmark_taipei.png"
254
+ ],
255
+ "QAs": [
256
+ {
257
+ "question": "<image>\nWhich city is this landmark in?",
258
+ "expected_answer": "The landmark in the image is located in Taipei, Taiwan."
259
+ }
260
+ ]
261
+ },
262
+ {
263
+ "name": "landmark new york 1",
264
+ "image_paths": [
265
+ "landmark_new_york_1.png"
266
+ ],
267
+ "QAs": [
268
+ {
269
+ "question": "<image>\nWhich city is this landmark in?",
270
+ "expected_answer": "This landmark is located in New York City."
271
+ }
272
+ ]
273
+ },
274
+ {
275
+ "name": "landmark new york 2",
276
+ "image_paths": [
277
+ "landmark_new_york_2.png"
278
+ ],
279
+ "QAs": [
280
+ {
281
+ "question": "<image>\nWhich city is this landmark in?",
282
+ "expected_answer": "The landmark in the image is located in New York City."
283
+ }
284
+ ]
285
+ },
286
+ {
287
+ "name": "landmark Kyoto",
288
+ "image_paths": [
289
+ "landmark_kyoto.png"
290
+ ],
291
+ "QAs": [
292
+ {
293
+ "question": "<image>\nWhich city is this landmark in?",
294
+ "expected_answer": "This landmark is located in Kyoto, Japan."
295
+ }
296
+ ]
297
+ },
298
+ {
299
+ "name": "visual inference",
300
+ "image_paths": [
301
+ "visual_inference.png"
302
+ ],
303
+ "QAs": [
304
+ {
305
+ "question": "<image>\nWhat is in the circled glass? Object 1 or 2? Describe what is Object 1 and object 2. Then check what is in the circled glass.",
306
+ "expected_answer": "Object 1 is a bottle of water, and Object 2 is a bottle of beer. The circled glass contains Object 1, which is a bottle of water."
307
+ }
308
+ ]
309
+ },
310
+ {
311
+ "name": "Gemini handdraw car",
312
+ "image_paths": [
313
+ "gemini_handdraw_car.png"
314
+ ],
315
+ "QAs": [
316
+ {
317
+ "question": "<image>\nWhich of these cars is more aerodynamic? The one on the left or the right? Explain why, using specific visual details.",
318
+ "expected_answer": "The car on the right is more aerodynamic than the one on the left. This is because the car on the right has a more streamlined shape, which reduces air resistance and drag. The car on the left has a more traditional, boxy shape, which increases air resistance and drag. The aerodynamic car on the right would be more efficient and faster, while the car on the left would be slower and less efficient."
319
+ }
320
+ ]
321
+ },
322
+ {
323
+ "name": "caption night view",
324
+ "image_paths": [
325
+ "caption_nightview.png"
326
+ ],
327
+ "QAs": [
328
+ {
329
+ "question": "<image>\nDescribe the image with as much detail as possible.",
330
+ "expected_answer": "The image captures a beautiful nighttime scene of a city skyline with a large body of water in front of it. The city is illuminated by the lights of various buildings, creating a vibrant atmosphere. The water is filled with numerous boats of different sizes."
331
+ }
332
+ ]
333
+ },
334
+ {
335
+ "name": "caption meat",
336
+ "image_paths": [
337
+ "caption_meat.jpeg"
338
+ ],
339
+ "QAs": [
340
+ {
341
+ "question": "<image>\nDescribe the image with as much detail as possible.",
342
+ "expected_answer": "The image is a surreal and colorful scene featuring a landscape made entirely of meat. The landscape is filled with various types of meat, including bacon, ham, and sausage, creating a unique and eye-catching scene. The meat is arranged in the form of mountains, hills, and valleys, giving the impression of a fantasy world. In addition to the meat, there are several birds scattered throughout the scene."
343
+ }
344
+ ]
345
+ },
346
+ {
347
+ "name": "company",
348
+ "image_paths": [
349
+ "company_1.png",
350
+ "company_2.png",
351
+ "company_3.png"
352
+ ],
353
+ "QAs": [
354
+ {
355
+ "question": "Image 1: <image>\nImage 2: <image>\nImage 3: <image>. In Image 1, The company is famous for its search engine. In Image 2, The company is famous for iPhone and Mac. What is the company in Image 3 famous for?",
356
+ "expected_answer": "The company is famous for its graphics processing units."
357
+ }
358
+ ]
359
+ },
360
+ {
361
+ "name": "count animal",
362
+ "image_paths": [
363
+ "count_panda_3.png",
364
+ "count_dog_2.png",
365
+ "count_giraff_4.png"
366
+ ],
367
+ "QAs": [
368
+ {
369
+ "question": "<image> pandas: 3.\n <image> dogs:2. <image>",
370
+ "expected_answer": "giraffes: 4"
371
+ }
372
+ ]
373
+ },
374
+ {
375
+ "name": "french",
376
+ "image_paths": [
377
+ "french_1.png",
378
+ "french_2.png",
379
+ "french_3.png"
380
+ ],
381
+ "QAs": [
382
+ {
383
+ "question": "Image 1: <image> Les sanglots longs des violons de l’automne blessent mon coeur d’une langueur monotone. \n Image 2: <image> Pour qui sont ces serpents qui sifflent sur vos têtes? \n Image 3: <image>",
384
+ "expected_answer": "Les flamands roses s'embrassent avec passion, leurs cœurs se touchant, leur amour se partageant."
385
+ }
386
+ ]
387
+ },
388
+ {
389
+ "name": "meme",
390
+ "image_paths": [
391
+ "meme.png"
392
+ ],
393
+ "QAs": [
394
+ {
395
+ "question": "<image>\nCan you explain the meme?",
396
+ "expected_answer": "The meme depicts a man's reaction to the price of a computer graphics card. In the first image, the man is smiling and appears excited about the product. In the second image, he is shocked and disappointed by the high price of the graphics card, which is $1,200. The meme is a playful representation of the contrast between the man's initial enthusiasm and his subsequent disappointment upon learning the cost of the product."
397
+ }
398
+ ]
399
+ },
400
+ {
401
+ "name": "flying chair",
402
+ "image_paths": [
403
+ "flying_chair.png"
404
+ ],
405
+ "QAs": [
406
+ {
407
+ "question": "<image>\nWhat is unusual about this image?",
408
+ "expected_answer": "The unusual aspect of this image is that a chair is flying through the air on a highway, seemingly coming out of the back of a truck."
409
+ },
410
+ {
411
+ "question": "<image>\nWhat should you do if you encounter this?",
412
+ "expected_answer": "If you encounter this situation, you should immediately stop your vehicle and move to a safe distance from the truck and the flying chair. It is essential to avoid any potential hazards and contact the authorities to report the incident and ensure the safety of everyone involved."
413
+ }
414
+ ]
415
+ },
416
+ {
417
+ "name": "palm_e",
418
+ "image_paths": [
419
+ "palm_e_1.png",
420
+ "palm_e_2.png",
421
+ "palm_e_3.png"
422
+ ],
423
+ "QAs": [
424
+ {
425
+ "question": "Image 1: <image>\nImage 2: <image>\nImage 3: <image>. Image 1: at 10:30 am. Image 2: at 12:45 pm. Image3: at 3:45 pm. What did I have for lunch, and what time was it?",
426
+ "expected_answer": "I had a sandwich for lunch, and it was at 12:45 pm."
427
+ }
428
+ ]
429
+ },
430
+ {
431
+ "name": "orange price",
432
+ "image_paths": [
433
+ "orange_price.png"
434
+ ],
435
+ "QAs": [
436
+ {
437
+ "question": "<image>\nWhat's the price for a single orange? Look at the price tag in details.",
438
+ "expected_answer": "$1.25"
439
+ }
440
+ ]
441
+ },
442
+ {
443
+ "name": "tow car",
444
+ "image_paths": [
445
+ "tow_car.png"
446
+ ],
447
+ "QAs": [
448
+ {
449
+ "question": "<image>\nWhat's the person doing?",
450
+ "expected_answer": "The person is lying on the ground next to a car, possibly working on it or inspecting it."
451
+ }
452
+ ]
453
+ },
454
+ {
455
+ "name": "parking sign",
456
+ "image_paths": [
457
+ "parking_sign.png"
458
+ ],
459
+ "QAs": [
460
+ {
461
+ "question": "<image>\nHow long can I park here 5pm on Mondays? Look at the traffic signs in details.",
462
+ "expected_answer": "After 5pm on Monday, you can park for 1 hour."
463
+ }
464
+ ]
465
+ },
466
+ {
467
+ "name": "car block",
468
+ "image_paths": [
469
+ "car_blocker.png"
470
+ ],
471
+ "QAs": [
472
+ {
473
+ "question": "<image>\n Look at the traffic condition, can the vehicle proceed now? Why?",
474
+ "expected_answer": "Based on the image, the vehicle cannot proceed through the traffic yet. There are multiple people and bicycles in the crosswalk, and the traffic light is red. The vehicle must wait for the traffic light to turn green before proceeding."
475
+ }
476
+ ]
477
+ },
478
+ {
479
+ "name": "car safety",
480
+ "image_paths": [
481
+ "car_safety.jpg"
482
+ ],
483
+ "QAs": [
484
+ {
485
+ "question": "<image>\nIs the driver on the phone? ",
486
+ "expected_answer": "Yes, the driver is on the phone."
487
+ },
488
+ {
489
+ "question": "<image>\nHow many people are in the car?",
490
+ "expected_answer": "There are two people in the car with one person driving and the other in the back of the car."
491
+ },
492
+ {
493
+ "question": "<image>\nIs the driver distracted?",
494
+ "expected_answer": "Yes, the driver is distracted as he is holding a cell phone interacting with it while sitting on the driver’s seat."
495
+ },
496
+ {
497
+ "question": "<image>\nWhere is the passenger sitting?",
498
+ "expected_answer": "The passenger is sitting on the right side of the car."
499
+ },
500
+ {
501
+ "question": "<image>\nWhat is on the passenger seat? Is it safe?",
502
+ "expected_answer": "There is a pair of scissors on the passenger seat. It is not safe."
503
+ }
504
+ ]
505
+ },
506
+ {
507
+ "name": "factory",
508
+ "image_paths": [
509
+ "factory.jpg"
510
+ ],
511
+ "QAs": [
512
+ {
513
+ "question": "<image>\nHow many cars are jacked up?",
514
+ "expected_answer": "There are two cars jacked up in the image."
515
+ },
516
+ {
517
+ "question": "<image>\nWhat is the person whose head is under the jacked up car doing?",
518
+ "expected_answer": "The person whose head is under the jacked up car is likely performing a task related to the maintenance or repair of the vehicle. They could be inspecting the suspension, brakes, or other components of the car that require attention. The other people in the scene are also working on the vehicles, suggesting that they are part of a team or a group of mechanics or technicians who are collaborating to fix or maintain the cars."
519
+ },
520
+ {
521
+ "question": "<image>\nHow many people are there whose head is under the jacked up car?",
522
+ "expected_answer": "There are two persons whose head is under the jacked up car."
523
+ }
524
+ ]
525
+ },
526
+ {
527
+ "name": "factory count",
528
+ "image_paths": [
529
+ "factory_count_1.jpg",
530
+ "factory_count_2.jpg",
531
+ "factory_count_3.jpg",
532
+ "factory_count_4.jpg",
533
+ "factory_count_5.jpg",
534
+ "factory_count_6.jpg",
535
+ "factory_count_7.jpg",
536
+ "factory_count_8.jpg"
537
+ ],
538
+ "QAs": [
539
+ {
540
+ "question": "Frame 1: <image>\n Frame 2: <image>\n Frame 3: <image>\n Frame 4: <image>\n Frame 5: <image>\n Frame 6: <image>\n Frame 7: <image>\n Frame 8: <image>\n Considering the video frames, how many chip bags are picked up?",
541
+ "expected_answer": "Two chip bags are picked up."
542
+ }
543
+ ]
544
+ }
545
+ ]
546
+ }
VILA/inference_test/inference_test.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ """
18
+ Inference test to run all examples from the paper and compare w/ expected output.
19
+ Both the inference results and expected output will be printed out.
20
+
21
+ Currently do not support multi-turn chat. Each time an image and question are input and answer is output.
22
+ """
23
+
24
+ import argparse
25
+ import json
26
+ import os
27
+
28
+ import torch
29
+ from PIL import Image
30
+
31
+ from llava.constants import IMAGE_TOKEN_INDEX
32
+ from llava.conversation import SeparatorStyle, conv_templates
33
+ from llava.mm_utils import (KeywordsStoppingCriteria, process_images,
34
+ tokenizer_image_token)
35
+ from llava.model import *
36
+
37
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
38
+
39
+
40
+ from llava.model.builder import load_pretrained_model
41
+
42
+
43
def eval_model(args, model, tokenizer, image_processor):
    """Run every test case from a JSON spec through the model and collect results.

    Reads ``args.test_json_path`` (a dict with a ``"test_cases"`` list; each case
    has ``"image_paths"`` and a ``"QAs"`` list of question/expected_answer pairs),
    loads the images from ``args.test_image_path``, generates one answer per
    question, and prints model output next to the expected answer.

    Fix over the original: the dead ``if 1: ... else: ...`` scaffold was removed.
    The ``else`` branch was unreachable and referenced an undefined name
    (``image_tokens`` — its only definition is commented out), so only the live
    branch is kept; behavior is unchanged.

    Args:
        args: parsed CLI namespace (needs test_json_path, test_image_path, temperature).
        model: loaded VILA/LLaVA model (must support ``generate`` with ``images=``).
        tokenizer: tokenizer matching the model.
        image_processor: image preprocessor matching the model's vision tower.

    Returns:
        list[dict]: one dict per question with keys ``question``, ``output``,
        ``expected_output``.
    """
    # Load the test-case spec.
    with open(args.test_json_path) as f:
        all_test_cases = json.load(f)

    result_list = []
    print(len(all_test_cases["test_cases"]))

    for test_case in all_test_cases["test_cases"]:
        # Read and preprocess all images for this test case up front; the same
        # tensor batch is reused for every question in the case.
        image_file_list = test_case["image_paths"]
        image_list = [
            Image.open(os.path.join(args.test_image_path, image_file)).convert("RGB") for image_file in image_file_list
        ]
        image_tensor = process_images(image_list, image_processor, model.config)

        for i in range(len(test_case["QAs"])):
            query = test_case["QAs"][i]["question"]
            query_text = query

            # Cases with 3+ images use the no-system-prompt template; fewer use
            # the standard vicuna_v1 template. (Heuristic carried over from the
            # original — presumably to save context length; verify if changed.)
            if len(image_list) < 3:
                conv = conv_templates["vicuna_v1"].copy()
            else:
                conv = conv_templates["vicuna_v1_nosys"].copy()
            conv.append_message(conv.roles[0], query)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            print("%" * 10 + " " * 5 + "VILA Response" + " " * 5 + "%" * 10)

            # Tokenize with <image> placeholders mapped to IMAGE_TOKEN_INDEX.
            inputs = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX)
            input_ids = torch.as_tensor(inputs).cuda().unsqueeze(0)

            # Stop generation at the conversation separator.
            stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
            keywords = [stop_str]
            stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.to(dtype=torch.float16, device="cuda", non_blocking=True),
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    top_p=0.7,
                    max_new_tokens=512,
                    stopping_criteria=[stopping_criteria],
                )

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
            outputs = outputs.strip()

            print(f"Question: {query_text}")
            print(f"VILA output: {outputs}")
            print(f'Expected output: {test_case["QAs"][i]["expected_answer"]}')

            result_list.append(
                dict(question=query_text, output=outputs, expected_output=test_case["QAs"][i]["expected_answer"])
            )
    return result_list
134
+
135
+
136
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-name", type=str, default=None)
    parser.add_argument("--test_json_path", type=str, default=None)
    parser.add_argument("--test_image_path", type=str, default=None)
    parser.add_argument("--conv-mode", type=str, default=None)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--pad", action="store_true")

    args = parser.parse_args()

    tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_name, "llava_llama", None)
    result_list = eval_model(args, model, tokenizer, image_processor)

    # Build the output filename from the model name (last path component).
    save_name = f"inference-test_{args.model_name.split('/')[-1]}"
    # Bug fix: --conv-mode defaults to None, so the bare `"nosys" in
    # args.conv_mode` raised TypeError whenever the flag was omitted.
    if args.conv_mode and "nosys" in args.conv_mode:
        save_name += "_nosys"
    save_name += ".json"
    result_list_str = json.dumps(result_list, indent=2)
    # Bug fix: the original serialized the results but never wrote them to
    # disk; persist them so the run actually produces its report file.
    with open(save_name, "w") as f:
        f.write(result_list_str)
    print(f"Saved {len(result_list)} results to {save_name}")
VILA/llava.egg-info/PKG-INFO ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: llava
3
+ Version: 1.0.0
4
+ Summary: VILA: On Pre-training for Visual Language Models
5
+ Project-URL: Homepage, https://hanlab.mit.edu/projects/vila
6
+ Project-URL: Bug Tracker, https://github.com/Efficient-Large-Model/VILA-Internal/issues
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: Apache Software License
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: torch==2.0.1
13
+ Requires-Dist: torchvision==0.15.2
14
+ Requires-Dist: transformers==4.31.0
15
+ Requires-Dist: tokenizers<0.14,>=0.12.1
16
+ Requires-Dist: sentencepiece==0.1.99
17
+ Requires-Dist: shortuuid
18
+ Requires-Dist: accelerate==0.27.2
19
+ Requires-Dist: peft==0.5.0
20
+ Requires-Dist: bitsandbytes==0.41.0
21
+ Requires-Dist: pydantic<2,>=1
22
+ Requires-Dist: markdown2[all]
23
+ Requires-Dist: numpy
24
+ Requires-Dist: scikit-learn==1.2.2
25
+ Requires-Dist: gradio==3.35.2
26
+ Requires-Dist: gradio_client==0.2.9
27
+ Requires-Dist: requests
28
+ Requires-Dist: httpx==0.24.0
29
+ Requires-Dist: uvicorn
30
+ Requires-Dist: fastapi
31
+ Requires-Dist: einops==0.6.1
32
+ Requires-Dist: einops-exts==0.0.4
33
+ Requires-Dist: timm==0.6.13
34
+ Requires-Dist: openpyxl==3.1.2
35
+ Requires-Dist: pytorchvideo==0.1.5
36
+ Requires-Dist: datasets==2.16.1
37
+ Requires-Dist: openai==1.8.0
38
+ Requires-Dist: webdataset==0.2.86
39
+ Provides-Extra: train
40
+ Requires-Dist: deepspeed==0.13.2; extra == "train"
41
+ Requires-Dist: ninja; extra == "train"
42
+ Requires-Dist: wandb; extra == "train"
43
+ Provides-Extra: eval
44
+ Requires-Dist: mmengine; extra == "eval"
45
+ Requires-Dist: word2number; extra == "eval"
46
+ Requires-Dist: Levenshtein; extra == "eval"
47
+
48
+ <p align="center">
49
+ <img src="demo_images/vila-logo.jpg" width="20%"/>
50
+ </p>
51
+
52
+ # VILA: On Pre-training for Visual Language Models
53
+
54
+ [![Code License](https://img.shields.io/badge/Code%20License-Apache_2.0-green.svg)](CODE_LICENSE)
55
+ [![Model License](https://img.shields.io/badge/MODEL%20License-CC%20By%20NC%204.0-red.svg)](MODEL_LICENSE)
56
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)
57
+
58
+
59
+ [VILA arxiv](https://arxiv.org/abs/2312.07533) / [VILA Demo](https://vila-demo.hanlab.ai/) / [VILA Huggingface](https://huggingface.co/collections/Efficient-Large-Model/vila-on-pre-training-for-visual-language-models-65d8022a3a52cd9bcd62698e)
60
+
61
+ ## 💡 Introduction
62
+ VILA is a visual language model (VLM) pretrained with interleaved image-text data at scale, enabling multi-image VLM. VILA is deployable on the edge, including Jetson Orin and laptop by [AWQ](https://arxiv.org/pdf/2306.00978.pdf) 4bit quantization through [TinyChat](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat) framework. We find: (1) image-text pairs are not enough, interleaved image-text is essential; (2) unfreezing LLM during interleaved image-text pre-training enables in-context learning; (3)re-blending text-only instruction data is crucial to boost both VLM and text-only performance. VILA unveils appealing capabilities, including: multi-image reasoning, in-context learning, visual chain-of-thought, and better world knowledge.
63
+
64
+
65
+ ## 💡 News
66
+ - [2024/02] We release [AWQ](https://arxiv.org/pdf/2306.00978.pdf)-quantized 4bit VILA models, deployable on Jetson Orin and laptops through [TinyChat](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat) and [TinyChatEngine](https://github.com/mit-han-lab/TinyChatEngine).
67
+ - [2024/02] VILA is released. We propose interleaved image-text pretraining that enables multi-image VLM. VILA comes with impressive in-context learning capabilities. We open source everything: including training code, evaluation code, datasets, model ckpts.
68
+ - [2023/12] [Paper](https://arxiv.org/abs/2312.07533) is on Arxiv!
69
+
70
+ ## Performance
71
+
72
+ | $~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~$ | Prec. | VQAv2 | GQA | VizWiz | SQA-I | VQA-T | POPE | MME | MMB | MMB-CN | SEED | llava-bench | MM-Vet | Average (w/o MME) |
73
+ | ----------------- | ---------------- | ---------------- | ---------- | ----------- | ----------- | ----- | ----- | ------- | ---- | ------ | ---- | ----------- | ------ | ----------------- |
74
+ | VILA-7B | fp16 | 80.3 | 63.1 | 59.6 | 68.0 | 62.6 | 86.3 | 1489.4 | 69.8 | 61.0 | 61.7 | 75.2 | 35.1 | 65.7 |
75
+ | VILA-7B-AWQ | int4 | 80.1 | 63.0 | 57.8 | 68.0 | 61.9 | 85.3 | 1486.3 | 68.8 | 59.0 | 61.3 | 75.8 | 35.9 | 65.2 |
76
+ | VILA-13B | fp16| 80.5 | 63.6 | 63.1 | 70.5 | 64.0 | 86.3 | 1553.6 | 73.8 | 66.7 | 62.8 | 78.3 | 42.6 | 68.4 |
77
+ | VILA-13B-AWQ | int4 | 80.4 | 63.6 | 63.0 | 71.2 | 63.5 | 87.0 | 1552.9 | 73.6 | 66.3 | 62.2 | 77.6 | 42.0 | 68.2 |
78
+
79
+ <sup>NOTE: The benchmark results are slightly different from what we report in the paper due to refactoring of the codebase based on LLava-1.5 and re-train the model. VQAV2 and VizWiz are test-dev.</sup>
80
+
81
+ ### Inference speed ( Token/sec )
82
+
83
+ | $~~~~~~$ | Precision | A100 | 4090 | Orin |
84
+ | --- | --- |--- | --- | --- |
85
+ | VILA-7B | fp16 | 81.6 | 58.5 | 11.5 |
86
+ | VILA-7B-AWQ| int4 |155.3| 168.1| 35.6 |
87
+ | VILA-13B | fp16 | 48.5 | OOM | 6.1 |
88
+ | VILA-13B-AWQ | int4 | 102.1| 99.0| 17.5 |
89
+
90
+
91
+ ## VILA Examples
92
+
93
+ ### In context learning
94
+ <img src="demo_images/demo_img_1.png" height="239">
95
+ <img src="demo_images/demo_img_2.png" height="250">
96
+
97
+ ### Multi-image reasoning
98
+ <img src="demo_images/demo_img_3.png" height="193">
99
+
100
+
101
+ ### VILA on Jetson Orin
102
+
103
+ https://github.com/Efficient-Large-Model/VILA/assets/7783214/6079374c-0787-4bc4-b9c6-e1524b4c9dc4
104
+
105
+ ### VILA on RTX 4090
106
+
107
+ https://github.com/Efficient-Large-Model/VILA/assets/7783214/80c47742-e873-4080-ad7d-d17c4700539f
108
+
109
+ </details>
110
+
111
+ ## Installation
112
+
113
+ ```bash
114
+ ./environment_setup.sh
115
+ ```
116
+
117
+ or follow the instructions below in order.
118
+
119
+ ```
120
+ conda create -n vila python=3.10 -y
121
+ conda activate vila
122
+
123
+ pip install --upgrade pip # enable PEP 660 support
124
+ wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.4.2/flash_attn-2.4.2+cu118torch2.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
125
+ pip install flash_attn-2.4.2+cu118torch2.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
126
+ pip install -e .
127
+ pip install -e ".[train]"
128
+
129
+ pip install git+https://github.com/huggingface/transformers@v4.38.1
130
+ cp -r ./llava/train/transformers_replace/* ~/anaconda3/envs/vila/lib/python3.10/site-packages/transformers/
131
+ ```
132
+
133
+ ## Training
134
+
135
+ VILA training contains three steps
136
+
137
+ ### Step-1: Alignment
138
+ We utilize LLaVA-CC3M-Pretrain-595K dataset to align the textual and visual modalities.
139
+
140
+ The stage 1 script takes in two parameters and it can run on a single 8xA100 node. `BASE_MODEL_PATH` points to a online or local huggingface repository, such as `NousResearch/Llama-2-7b-hf`. `OUTPUT_NAME` points to a target directory under `checkpoints`, which will save the trained multimodal projector afterwards.
141
+
142
+ ```bash
143
+ bash scripts/v1_5/paper/1_mm_align.sh [BASE_MODEL_PATH] [OUTPUT_NAME]
144
+ ```
145
+
146
+ | Hyperparameter | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
147
+ | --- | ---: | ---: | ---: | ---: | ---: |
148
+ | VILA-7B | 256 | 2e-5 | 1 | 4096 | 0 |
149
+ | VILA-13B | 256 | 2e-5 | 1 | 4096 | 0 |
150
+
151
+
152
+ ### Step-2: Pretraining
153
+ We use MMC4 and Coyo dataset to train VLM with interleaved image-text pairs.
154
+
155
+ ```bash
156
+ bash scripts/v1_5/paper/2_pretrain_mmc4_coyo.sh [CODE_PATH] [BASE_MODEL_PATH] [STAGE1_PATH] [OUTPUT_NAME]
157
+ ```
158
+
159
+ The stage 2 script takes in four arguments. `CODE_PATH` is the absolute path to our VILA codebase, `BASE_MODEL_PATH` has similar meaning to what is presented in the stage 1 script. `STAGE1_PATH` points to the `OUTPUT_NAME` of stage 1 (i.e. where the stage 1 checkpoint is stored). `OUTPUT_NAME` is the desired folder name under `checkpoints` that saves the pretraining checkpoint. The script we provided for this stage is executed on slurm, and we expect it to execute on 16 nodes (128 GPUs).
160
+
161
+ | Hyperparameter | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
162
+ | --- | ---: | ---: | ---: | ---: | ---: |
163
+ | VILA-7B | 1024 | 5e-5 | 1 | 4096 | 0 |
164
+ | VILA-13B | 1024 | 5e-5 | 1 | 4096 | 0 |
165
+
166
+ ### Step-3: Supervised fine-tuning
167
+ This is the last stage of VILA training, in which we tune the model to follow multimodal instructions on a subset of M3IT, FLAN and ShareGPT4V. This stage runs on a 8xA100 node.
168
+
169
+ ```bash
170
+ bash scripts/v1_5/paper/3_sft.sh [STAGE2_PATH] [OUTPUT_NAME]
171
+ ```
172
+ The stage 3 script takes in two arguments. `STAGE2_PATH` points to the `OUTPUT_NAME` of the stage 2 script (i.e. where the stage 2 checkpoint is stored). `OUTPUT_NAME` is the desired folder name under `checkpoints` that stores the final checkpoint.
173
+
174
+ | Hyperparameter | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
175
+ | --- | ---: | ---: | ---: | ---: | ---: |
176
+ | VILA-7B | 128 | 2e-5 | 1 | 4096 | 0 |
177
+ | VILA-13B | 128 | 2e-5 | 1 | 4096 | 0 |
178
+
179
+ ### Training with fewer GPUs
180
+ To train with fewer GPUs/nodes, you can reduce the `per_device_train_batch_size` and increase the `gradient_accumulation_steps` accordingly. As long as the global batch size (`per_device_train_batch_size` x `gradient_accumulation_steps` x `num_gpus`) is kept the same, the training precision will not be affected.
181
+
182
+ Stage 1 completes within 3.5 (7B) - 5.5 (13B) hours on 8xA100, Stage 2 completes within 30 hours on 128xA100 for VILA-7B, and stage 3 completes in 25 (7B) - 40 (13B) hours on 8xA100.
183
+
184
+ See [data_prepare/README.md](data_prepare/README.md) for more information about how to prepare datasets.
185
+
186
+ ## Evaluations
187
+
188
+ You can follow [Llava1.5 eval](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md) to download all datasets. After downloading all datasets, please put them under `playground/data/eval`.
189
+
190
+ We provide a push-the-button script to perform evaluation on all 10 datasets that do not require GPT-assisted evaluation:
191
+
192
+ ```bash
193
+ ./scripts/v1_5/eval/eval_all.sh [CHECKPOINT_PATH] [MODEL_NAME]
194
+ ```
195
+
196
+ This script takes in two parameters, `CHECKPOINT_PATH` points to the stage 3 model checkpoint, and `MODEL_NAME` will be the name of evaluation results.
197
+
198
+
199
+ [VQAv2](https://eval.ai/web/challenges/challenge-page/830/my-submission) and [Vizwiz](https://eval.ai/web/challenges/challenge-page/2185/my-submission) evaluations are hosted on eval.ai. You need to register an account and create a team to be able to submit eval.
200
+
201
+ MMBench and MMBench_CN eval are hosted on another [evaluation server](https://opencompass.org.cn/leaderboard-multimodal). Make sure you change the name of the file before submitting, otherwise the server caches results and will always return the wrong result to you.
202
+
203
+ We provide a quick script to automatically organize the prediction files that need to be submitted to servers:
204
+
205
+ ```bash
206
+ python scripts/v1_5/eval/copy_predictions.py [MODEL_NAME]
207
+ ```
208
+
209
+ You will be able to find the predictions under `playground/data/predictions_upload/[MODEL_NAME]` after executing this script.
210
+
211
+ ## Inference
212
+
213
+ We provide snippets for quick inference with user prompts and images.
214
+
215
+ VILA-7B inference:
216
+ ```bash
217
+ python -W ignore llava/eval/run_llava.py \
218
+ --model-name Efficient-Large-Model/VILA-7B \
219
+ --conv-mode vicuna_v1 \
220
+ --query "<image>\n Please describe the traffic condition." \
221
+ --image-file "av.png"
222
+ ```
223
+
224
+ VILA-13B inference:
225
+ ```bash
226
+ python -W ignore llava/eval/run_llava.py \
227
+ --model-name Efficient-Large-Model/VILA-13B \
228
+ --conv-mode vicuna_v1 \
229
+ --query "<image>\n Please describe the traffic condition." \
230
+ --image-file "av.png"
231
+ ```
232
+
233
+ ## Quantization and Deployment
234
+
235
+ Our VILA models are quantized by [AWQ](https://arxiv.org/abs/2306.00978) into 4 bits for efficient inference on the edge. We provide a push-the-button [script](https://github.com/mit-han-lab/llm-awq/blob/main/scripts/vila_example.sh) to quantize VILA with AWQ.
236
+
237
+ ### Running VILA on desktop GPUs and edge GPUs
238
+
239
+ We support AWQ-quantized 4bit VILA on GPU platforms via [TinyChat](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat). We provide a [tutorial](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat#support-vlm-models-vila--llava) to run the model with TinyChat after quantization. We also provide an [instruction](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat/serve) to launch a Gradio server (powered by TinyChat and AWQ) to serve 4-bit quantized VILA models.
240
+
241
+ ### Running VILA on laptops
242
+
243
+ We further support our AWQ-quantized 4bit VILA models on various CPU platforms with both x86 and ARM architectures with our [TinyChatEngine](https://github.com/mit-han-lab/TinyChatEngine). We also provide a detailed [tutorial](https://github.com/mit-han-lab/TinyChatEngine/tree/main?tab=readme-ov-file#deploy-vision-language-model-vlm-chatbot-with-tinychatengine) to help the users deploy VILA on different CPUs.
244
+
245
+
246
+
247
+ ## Checkpoints
248
+
249
+ We release [VILA-7B](https://hf.co/Efficient-Large-Model/VILA-7b), [VILA-13B](https://hf.co/Efficient-Large-Model/VILA-13b), [VILA-7B-4bit-AWQ](https://hf.co/Efficient-Large-Model/VILA-7b-4bit-awq) and [VILA-13B-4bit-AWQ](https://hf.co/Efficient-Large-Model/VILA-13b-4bit-awq).
250
+
251
+ ## 🔒 License
252
+ - The code is released under the Apache 2.0 license as found in the [LICENSE](./LICENSE) file.
253
+ - The pretrained weights are released under the [CC-BY-NC-SA-4.0 license](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en).
254
+ - The service is a research preview intended for non-commercial use only, and is subject to the following licenses and terms:
255
+ - [Model License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA
256
+ - [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI
257
+ - [Dataset Licenses](./data_prepare/LICENSE) for each one used during training.
258
+
259
+ ## Team
260
+ | | | |
261
+ | --- | --- | ---|
262
+ [*Ji Lin](https://www.linji.me/): OpenAI (work done at Nvidia and MIT) | [*Hongxu Yin](https://hongxu-yin.github.io/): Nvidia | [*Yao Lu](https://scholar.google.com/citations?user=OI7zFmwAAAAJ&hl=en): Nvidia
263
+ [Wei Ping](https://scholar.google.com/citations?user=6gKEYRgAAAAJ&hl=en): Nvidia | [Pavlo Molchanov](https://www.pmolchanov.com/): Nvidia | [Andrew Tao](https://scholar.google.com/citations?user=Wel9l1wAAAAJ&hl=en): Nvidia |
264
+ [Haotian Tang](http://kentang.net/): MIT | [Shang Yang](https://ys-2020.github.io/): MIT | [Ligeng Zhu](https://lzhu.me/): Nvidia, MIT |
265
+ [Wei-Chen Wang](https://weichenwang.me/): MIT | [Fuzhao Xue](https://xuefuzhao.github.io/): Nvidia, NUS | [Yunhao Fang](https://seerkfang.github.io/): Nvidia, UCSD |
266
+ [Yukang Chen](https://yukangchen.com/): Nvidia, CUHK | [Yue Shen](https://www.linkedin.com/in/yue-james-shen/): Nvidia | [Huizi Mao](https://scholar.google.com/citations?user=r5WezOYAAAAJ&hl=zh-CN): Nvidia |
267
+ [Jan Kautz](https://jankautz.com/): Nvidia | [Mohammad Shoeybi](https://scholar.google.com/citations?user=62ElavIAAAAJ&hl=en): Nvidia | [Song Han](http://songhan.mit.edu/): Nvidia, MIT
268
+
269
+
270
+ ## Citations
271
+
272
+ ```
273
+ @misc{lin2023vila,
274
+ title={VILA: On Pre-training for Visual Language Models},
275
+ author={Ji Lin and Hongxu Yin and Wei Ping and Yao Lu and Pavlo Molchanov and Andrew Tao and Huizi Mao and Jan Kautz and Mohammad Shoeybi and Song Han},
276
+ year={2023},
277
+ eprint={2312.07533},
278
+ archivePrefix={arXiv},
279
+ primaryClass={cs.CV}
280
+ }
281
+ ```
282
+
283
+ # Acknowledgement
284
+ - [LLaVA](https://github.com/haotian-liu/LLaVA): the codebase we built upon. Thanks for their wonderful work.
285
+ - [Vicuna](https://github.com/lm-sys/FastChat): the amazing open-sourced large language model!
286
+ - [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT): we borrowed video evaluation script from this repository.
287
+ - [MMC4](https://github.com/allenai/mmc4), [COYO-700M](https://github.com/kakaobrain/coyo-dataset), [M3IT](https://huggingface.co/datasets/MMInstruction/M3IT), [OpenORCA/FLAN](https://huggingface.co/datasets/Open-Orca/FLAN), [ShareGPT4V](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V) for providing datasets used in this research.
VILA/llava.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ CIs/send_email.py
5
+ data_prepare/coyo/coyo_downloader.py
6
+ data_prepare/coyo/coyo_splitter.py
7
+ data_prepare/mmc4/mmc4_downloader.py
8
+ data_prepare/mmc4/mmc4_filter_and_counter.py
9
+ data_prepare/mmc4/mmc4_merger.py
10
+ data_prepare/sft/preprocess_flan.py
11
+ data_prepare/sft/preprocess_m3it.py
12
+ data_prepare/sft/split_vflan.py
13
+ demo_trt_llm/llava.py
14
+ demo_trt_llm/test_vila.py
15
+ inference_test/dataset_test.py
16
+ inference_test/inference_test.py
17
+ llava/__init__.py
18
+ llava/constants.py
19
+ llava/conversation.py
20
+ llava/mm_utils.py
21
+ llava/unit_test_utils.py
22
+ llava/utils.py
23
+ llava.egg-info/PKG-INFO
24
+ llava.egg-info/SOURCES.txt
25
+ llava.egg-info/dependency_links.txt
26
+ llava.egg-info/requires.txt
27
+ llava.egg-info/top_level.txt
28
+ llava/data/__init__.py
29
+ llava/data/dataset.py
30
+ llava/data/dataset_tar.py
31
+ llava/data/datasets_mixture.py
32
+ llava/data/simple_video_dataset.py
33
+ llava/data/simple_vila_webdataset.py
34
+ llava/data/dataset_impl/coyo_recap.py
35
+ llava/data/dataset_impl/sam.py
36
+ llava/data_aug/caption2qa.py
37
+ llava/data_aug/dev.py
38
+ llava/data_aug/reformat_tar.py
39
+ llava/eval/eval_gpt_review.py
40
+ llava/eval/eval_gpt_review_bench.py
41
+ llava/eval/eval_gpt_review_visual.py
42
+ llava/eval/eval_mathvista.py
43
+ llava/eval/eval_mmmu.py
44
+ llava/eval/eval_mmvet.py
45
+ llava/eval/eval_pope.py
46
+ llava/eval/eval_science_qa.py
47
+ llava/eval/eval_science_qa_gpt4.py
48
+ llava/eval/eval_science_qa_gpt4_requery.py
49
+ llava/eval/eval_textvqa.py
50
+ llava/eval/evaluate_vqa.py
51
+ llava/eval/generate_webpage_data_from_table.py
52
+ llava/eval/m4c_evaluator.py
53
+ llava/eval/model_qa.py
54
+ llava/eval/model_vqa.py
55
+ llava/eval/model_vqa_loader.py
56
+ llava/eval/model_vqa_mmbench.py
57
+ llava/eval/model_vqa_mmmu.py
58
+ llava/eval/model_vqa_qbench.py
59
+ llava/eval/model_vqa_science.py
60
+ llava/eval/model_vqa_video.py
61
+ llava/eval/qa_baseline_gpt35.py
62
+ llava/eval/run_llava.py
63
+ llava/eval/summarize_gpt_review.py
64
+ llava/eval/mathvista_utils/calculate_score.py
65
+ llava/eval/mathvista_utils/extract_answer.py
66
+ llava/eval/mathvista_utils/utilities.py
67
+ llava/eval/mathvista_utils/prompts/ext_ans.py
68
+ llava/eval/mmmu_utils/data_utils.py
69
+ llava/eval/mmmu_utils/eval_utils.py
70
+ llava/eval/mmmu_utils/model_utils.py
71
+ llava/eval/video/eval_benchmark_1_correctness.py
72
+ llava/eval/video/eval_benchmark_2_detailed_orientation.py
73
+ llava/eval/video/eval_benchmark_3_context.py
74
+ llava/eval/video/eval_benchmark_4_temporal.py
75
+ llava/eval/video/eval_benchmark_5_consistency.py
76
+ llava/eval/video/eval_video_qa.py
77
+ llava/model/__init__.py
78
+ llava/model/apply_delta.py
79
+ llava/model/builder.py
80
+ llava/model/consolidate.py
81
+ llava/model/llava_arch.py
82
+ llava/model/make_delta.py
83
+ llava/model/utils.py
84
+ llava/model/language_model/llava_gemma.py
85
+ llava/model/language_model/llava_llama.py
86
+ llava/model/language_model/llava_mistral.py
87
+ llava/model/language_model/llava_mixtral.py
88
+ llava/model/language_model/llava_mpt.py
89
+ llava/model/language_model/mpt/adapt_tokenizer.py
90
+ llava/model/language_model/mpt/attention.py
91
+ llava/model/language_model/mpt/blocks.py
92
+ llava/model/language_model/mpt/configuration_mpt.py
93
+ llava/model/language_model/mpt/custom_embedding.py
94
+ llava/model/language_model/mpt/flash_attn_triton.py
95
+ llava/model/language_model/mpt/hf_prefixlm_converter.py
96
+ llava/model/language_model/mpt/meta_init_context.py
97
+ llava/model/language_model/mpt/modeling_mpt.py
98
+ llava/model/language_model/mpt/norm.py
99
+ llava/model/language_model/mpt/param_init_fns.py
100
+ llava/model/multimodal_encoder/builder.py
101
+ llava/model/multimodal_encoder/clip_encoder.py
102
+ llava/model/multimodal_encoder/siglip_encoder.py
103
+ llava/model/multimodal_encoder/vision_encoder.py
104
+ llava/model/multimodal_encoder/radio/__init__.py
105
+ llava/model/multimodal_encoder/radio/cls_token.py
106
+ llava/model/multimodal_encoder/radio/create_model.py
107
+ llava/model/multimodal_encoder/radio/enable_cpe_support.py
108
+ llava/model/multimodal_encoder/radio/enable_spectral_reparam.py
109
+ llava/model/multimodal_encoder/radio/extra_timm_models.py
110
+ llava/model/multimodal_encoder/radio/radio_encoder.py
111
+ llava/model/multimodal_encoder/radio/token_merging.py
112
+ llava/model/multimodal_encoder/radio/vit_patch_generator.py
113
+ llava/model/multimodal_projector/builder.py
114
+ llava/train/args.py
115
+ llava/train/llava_trainer.py
116
+ llava/train/short_video_filter.py
117
+ llava/train/slurm_utils.py
118
+ llava/train/train.py
119
+ llava/train/train_mem.py
120
+ llava/train/train_xformers.py
121
+ llava/train/transformer_normalize_monkey_patch.py
122
+ llava/train/utils.py
123
+ llava/train/transformers_replace/trainer.py
124
+ llava/train/transformers_replace/models/gemma/__init__.py
125
+ llava/train/transformers_replace/models/gemma/configuration_gemma.py
126
+ llava/train/transformers_replace/models/gemma/modeling_gemma.py
127
+ llava/train/transformers_replace/models/llama/configuring_llama.py
128
+ llava/train/transformers_replace/models/llama/modeling_llama.py
129
+ llava/train/transformers_replace/models/llama/tokenization_llama.py
130
+ llava/train/transformers_replace/models/mistral/__init__.py
131
+ llava/train/transformers_replace/models/mistral/configuration_mistral.py
132
+ llava/train/transformers_replace/models/mistral/modeling_mistral.py
133
+ llava/train/transformers_replace/models/mixtral/__init__.py
134
+ llava/train/transformers_replace/models/mixtral/configuration_mixtral.py
135
+ llava/train/transformers_replace/models/mixtral/modeling_mixtral.py
136
+ llava/train/transformers_replace/models/siglip/__init__.py
137
+ llava/train/transformers_replace/models/siglip/configuration_siglip.py
138
+ llava/train/transformers_replace/models/siglip/convert_siglip_to_hf.py
139
+ llava/train/transformers_replace/models/siglip/image_processing_siglip.py
140
+ llava/train/transformers_replace/models/siglip/modeling_siglip.py
141
+ llava/train/transformers_replace/models/siglip/processing_siglip.py
142
+ llava/train/transformers_replace/models/siglip/tokenization_siglip.py
143
+ llava/wids/__init__.py
144
+ llava/wids/wids.py
145
+ llava/wids/wids_bench.py
146
+ llava/wids/wids_cleanup.py
147
+ llava/wids/wids_dir.py
148
+ llava/wids/wids_dl.py
149
+ llava/wids/wids_index.py
150
+ llava/wids/wids_lru.py
151
+ llava/wids/wids_mmtar.py
152
+ llava/wids/wids_specs.py
153
+ llava/wids/wids_tar.py
154
+ tests/test_tokenizer.py
VILA/llava.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
VILA/llava.egg-info/requires.txt ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.0.1
2
+ torchvision==0.15.2
3
+ transformers==4.31.0
4
+ tokenizers<0.14,>=0.12.1
5
+ sentencepiece==0.1.99
6
+ shortuuid
7
+ accelerate==0.27.2
8
+ peft==0.5.0
9
+ bitsandbytes==0.41.0
10
+ pydantic<2,>=1
11
+ markdown2[all]
12
+ numpy
13
+ scikit-learn==1.2.2
14
+ gradio==3.35.2
15
+ gradio_client==0.2.9
16
+ requests
17
+ httpx==0.24.0
18
+ uvicorn
19
+ fastapi
20
+ einops==0.6.1
21
+ einops-exts==0.0.4
22
+ timm==0.6.13
23
+ openpyxl==3.1.2
24
+ pytorchvideo==0.1.5
25
+ datasets==2.16.1
26
+ openai==1.8.0
27
+ webdataset==0.2.86
28
+
29
+ [eval]
30
+ mmengine
31
+ word2number
32
+ Levenshtein
33
+
34
+ [train]
35
+ deepspeed==0.13.2
36
+ ninja
37
+ wandb
VILA/llava.egg-info/top_level.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ CIs
2
+ data
3
+ data_prepare
4
+ demo_images
5
+ demo_trt_llm
6
+ inference_test
7
+ llava
VILA/llava/.DS_Store ADDED
Binary file (6.15 kB). View file
 
VILA/llava/constants.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

# This file is modified from https://github.com/haotian-liu/LLaVA/

# Heartbeat timing for the controller/worker serving stack
# (units presumably seconds — consumed by the server code, not this module).
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

# Directory where log files are written.
LOGDIR = "."

# ---- Model constants ----
IGNORE_INDEX = -100  # matches PyTorch CrossEntropyLoss's default ignore_index
IMAGE_TOKEN_INDEX = -200  # sentinel id marking an image position in input_ids
DEFAULT_IMAGE_TOKEN = "<image>"  # textual placeholder for an image in a prompt
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"
VILA/llava/conversation.py ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
17
+
18
+ import dataclasses
19
+ from enum import Enum, auto
20
+ from typing import List
21
+
22
+
23
class SeparatorStyle(Enum):
    """Separator conventions used by Conversation.get_prompt to render history.

    Values are explicit but equivalent to the 1-based sequence ``auto()``
    would produce; only member identity is compared elsewhere.
    """

    AUTO = 1  # no rendering branch in get_prompt; presumably resolved to a concrete style elsewhere — TODO confirm
    SINGLE = 2  # one separator string after every turn
    TWO = 3  # alternating pair of separators (sep / sep2)
    MPT = 4  # ChatML-like: role header + message + sep
    PLAIN = 5  # no role names; messages joined with alternating separators
    LLAMA_2 = 6  # [INST] ... [/INST] wrapping with a <<SYS>> system block
    MISTRAL = 7  # [INST] wrapping with a leading <s>, no <<SYS>> block
    LLAMA_3 = 8  # role header + message; sep2 closes the final turn
34
+
35
+
36
@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history.

    Messages are ``[role, message]`` pairs; for turns that carry an image the
    message is a tuple ``(text, PIL.Image, image_process_mode)`` instead of a
    plain string (see get_images / to_gradio_chatbot below).
    """

    system: str  # system prompt prepended to the rendered conversation
    roles: List[str]  # (user_role, assistant_role) display/markup names
    messages: List[List[str]]  # [role, message] pairs; message may be a (text, image, mode) tuple
    offset: int  # number of leading few-shot messages skipped by get_images/to_gradio_chatbot
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"  # primary separator
    sep2: str = None  # secondary separator (used by TWO/PLAIN/LLAMA_2/MISTRAL/LLAMA_3)
    version: str = "Unknown"  # template version tag; "mmtag" in it changes image-token handling

    # Flag read by external code only; not used within this class.
    skip_next: bool = False

    def get_prompt(self):
        """Render the message history into a single prompt string per sep_style.

        If the first message carries an image tuple, its "<image>" token is
        normalized: either moved to the front of the text, or (for "mmtag"
        versions) replaced by an <Image>...</Image> preamble exchange.

        Raises:
            ValueError: for styles with no rendering branch here (e.g. AUTO —
                presumably resolved to a concrete style before this is called;
                TODO confirm against model code).
        """
        messages = self.messages
        if len(messages) > 0 and type(messages[0][1]) is tuple:
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()
            if "mmtag" in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            # "system###role: msg###role: msg###"; a role with no message yet
            # ends the prompt with "role:" to cue generation.
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            # Alternates sep (after user turns) and sep2 (after assistant turns).
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.LLAMA_3:
            # Role strings already carry the header markup; sep2 closes only
            # the final turn.
            ret = self.system + self.sep
            for rid, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message = message[0]
                    sep = self.sep if rid < len(messages) - 1 else self.sep2
                    ret += role + message + sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.MPT:
            # ChatML-like: role header + message + sep; no ": " joiner.
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2 or self.sep_style == SeparatorStyle.MISTRAL:
            # LLAMA_2 wraps the system prompt in <<SYS>>; MISTRAL inlines it and
            # starts the prompt with "<s>". User turns are wrapped in [INST].
            if self.sep_style == SeparatorStyle.LLAMA_2:
                wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
            else:
                wrap_sys = lambda msg: f"{msg}" + ("\n" if msg else "")
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""
            if self.sep_style == SeparatorStyle.MISTRAL:
                ret += "<s>"

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    if i == 0:
                        message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        if self.sep_style == SeparatorStyle.LLAMA_2:
                            ret += " " + message + " " + self.sep2
                        else:
                            ret += message + self.sep2
                else:
                    ret += ""
            # Drop the leading sep characters introduced before the first turn.
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.PLAIN:
            # Roles are ignored; messages are concatenated with alternating seps.
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        """Append a [role, message] pair to the history."""
        self.messages.append([role, message])

    def get_images(self, return_pil=False):
        """Collect images from user turns (every other message past offset).

        Applies the per-message image_process_mode ("Pad" squares with a gray
        background, "Resize" forces 336x336, "Default"/"Crop" leave it alone),
        then downscales so the shortest edge is at most 400px and the longest
        at most 800px. Returns PIL images if return_pil, else base64 PNG strings.
        """
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO

                    from PIL import Image

                    msg, image, image_process_mode = msg
                    if image_process_mode == "Pad":

                        def expand2square(pil_img, background_color=(122, 116, 104)):
                            # Pad the short side so the image becomes square,
                            # centered on a neutral background.
                            width, height = pil_img.size
                            if width == height:
                                return pil_img
                            elif width > height:
                                result = Image.new(pil_img.mode, (width, width), background_color)
                                result.paste(pil_img, (0, (width - height) // 2))
                                return result
                            else:
                                result = Image.new(pil_img.mode, (height, height), background_color)
                                result.paste(pil_img, ((height - width) // 2, 0))
                                return result

                        image = expand2square(image)
                    elif image_process_mode in ["Default", "Crop"]:
                        pass
                    elif image_process_mode == "Resize":
                        image = image.resize((336, 336))
                    else:
                        raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
                    # Cap resolution while preserving aspect ratio.
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if longest_edge != max(image.size):
                        if H > W:
                            H, W = longest_edge, shortest_edge
                        else:
                            H, W = shortest_edge, longest_edge
                        image = image.resize((W, H))
                    if return_pil:
                        images.append(image)
                    else:
                        buffered = BytesIO()
                        image.save(buffered, format="PNG")
                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                        images.append(img_b64_str)
        return images

    def to_gradio_chatbot(self):
        """Convert history (past offset) into Gradio chatbot [user, bot] pairs.

        User-turn image tuples are rendered as an inline <img> data URI followed
        by the message text with its "<image>" token stripped.
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO

                    msg, image, image_process_mode = msg
                    # Same downscaling policy as get_images, but applied
                    # unconditionally (no longest_edge check here).
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    buffered = BytesIO()
                    # NOTE(review): saved as JPEG but the data URI below claims
                    # image/png — mismatch; confirm which format is intended.
                    image.save(buffered, format="JPEG")
                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
                    msg = img_str + msg.replace("<image>", "").strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                # Assistant turn: fill in the reply slot of the last pair.
                ret[-1][-1] = msg
        return ret

    def copy(self):
        """Return a deep-enough copy: message pairs are re-listed, scalars shared."""
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version,
        )

    def dict(self):
        """Serialize to a plain dict; image tuples collapse to their text part.

        Note: the method name shadows the builtin ``dict`` inside this class body.
        """
        if len(self.get_images()) > 0:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }
266
+
267
+
268
# Pass-through template; AUTO has no branch in get_prompt, so this is
# presumably resolved to a concrete template elsewhere — TODO confirm.
conv_auto = Conversation(
    system="",
    roles=("", ""),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.AUTO,
    sep="\n",
)

# Vicuna v0: "###"-separated turns with a two-message few-shot example
# (offset=2 hides it from get_images/to_gradio_chatbot).
conv_vicuna_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
        (
            "Assistant",
            "Renewable energy sources are those that can be replenished naturally in a relatively "
            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
            "renewable and non-renewable energy sources:\n"
            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
            "energy sources are finite and will eventually run out.\n"
            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
            "and other negative effects.\n"
            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
            "have lower operational costs than non-renewable sources.\n"
            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
            "locations than non-renewable sources.\n"
            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
        ),
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

# Vicuna v1: alternating " " / "</s>" separators.
conv_vicuna_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

# kentang-mit@: This conversation template is designed for SFT on VFLAN.
conv_vicuna_v1_nosys = Conversation(
    system="",
    roles=("USER", "ASSISTANT"),
    version="v1_nosys",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

# Llama-2 chat template with its stock safety system prompt.
conv_llama_2 = Conversation(
    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

# Mistral [INST] template: empty system prompt, "<s>" emitted by the style.
conv_mistral = Conversation(
    system="",
    roles=("USER", "ASSISTANT"),
    version="mistral",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MISTRAL,
    sep="",
    sep2="</s>",
)
357
+
358
# Llama-2 style with a vision-assistant system prompt.
conv_llava_llama_2 = Conversation(
    system="You are a helpful language and vision assistant. "
    "You are able to understand the visual content that the user provides, "
    "and assist the user with a variety of tasks using natural language.",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

# MPT / ChatML-style template (<|im_start|> role headers, <|im_end|> separator).
conv_mpt = Conversation(
    system="""<|im_start|>system
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

# Roleless template: messages joined with "\n" / sep2 (PLAIN style).
conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)

conv_llava_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

# "mmtag" version: images are announced via <Image>...</Image> markup
# (see the "mmtag" branch in Conversation.get_prompt).
conv_llava_v0_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
    "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("Human", "Assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
    version="v0_mmtag",
)

conv_llava_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)


conv_llava_v1_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
    "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1_mmtag",
)

# Hermes-2 (ChatML-style) template.
hermes_2 = Conversation(
    system="<|im_start|>system\nAnswer the questions.",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
    messages=(),
    offset=0,
    version="hermes-2",
)


# Template added by Yukang. Note (kentang-mit@): sep is <|eot_id|> for official template.
llama_3_chat = Conversation(
    system="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. "
    "You are able to understand the visual content that the user provides, "
    "and assist the user with a variety of tasks using natural language.",
    roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
    version="llama_v3",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_3,
    sep="<|eot_id|>",
    sep2="<|end_of_text|>",
)
463
+
464
+
465
# Registry mapping template names (as used by CLI/--conv-mode flags) to
# Conversation instances. Several names intentionally alias the same template.
default_conversation = conv_auto
conv_templates = {
    "auto": conv_auto,
    "default": conv_vicuna_v0,
    "hermes-2": hermes_2,
    "llama_3": llama_3_chat,
    "v0": conv_vicuna_v0,
    "v1": conv_vicuna_v1,
    "vicuna_v1": conv_vicuna_v1,
    "vicuna_v1_nosys": conv_vicuna_v1_nosys,
    "llama_2": conv_llama_2,
    "mistral": conv_mistral,
    "plain": conv_llava_plain,
    "v0_plain": conv_llava_plain,
    "llava_v0": conv_llava_v0,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v1": conv_llava_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_llama_2": conv_llava_llama_2,
    "mpt": conv_mpt,
}


if __name__ == "__main__":
    # Smoke test: renders the default (AUTO) template's prompt.
    print(default_conversation.get_prompt())
VILA/llava/entry.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from typing import Optional

from transformers import PreTrainedModel

from llava.mm_utils import get_model_name_from_path
from llava.model.builder import load_pretrained_model

__all__ = ["load"]


def load(model_path: str, model_base: Optional[str] = None) -> PreTrainedModel:
    """Load a pretrained VILA/LLaVA model from a checkpoint path.

    Args:
        model_path: Path to the checkpoint directory; "~" is expanded. The
            model name is derived from this path before any nesting is resolved.
        model_base: Optional base model identifier, passed through to
            load_pretrained_model (e.g. for delta/LoRA checkpoints —
            TODO confirm semantics against llava.model.builder).

    Returns:
        The loaded model (second element of load_pretrained_model's 4-tuple;
        tokenizer, image processor and context length are discarded).
    """
    model_path = os.path.expanduser(model_path)
    model_name = get_model_name_from_path(model_path)
    # Some checkpoints keep the weights under a nested "model" subdirectory;
    # compute the candidate path once instead of joining twice.
    nested_path = os.path.join(model_path, "model")
    if os.path.exists(nested_path):
        model_path = nested_path
    _, model, _, _ = load_pretrained_model(model_path, model_name, model_base)
    return model
VILA/llava/mm_utils.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import base64
18
+ import os
19
+ import tempfile
20
+ from io import BytesIO
21
+
22
+ import numpy as np
23
+ import torch
24
+ from PIL import Image
25
+ from transformers import StoppingCriteria
26
+
27
+ from llava.constants import IMAGE_TOKEN_INDEX
28
+
29
+
30
def get_frame_from_vcap(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
    """Sample up to ``num_frames`` RGB frames evenly from an opened cv2 VideoCapture.

    Args:
        vidcap: An opened ``cv2.VideoCapture``.
        num_frames: Number of frames to sample.
        max_fps: Unused in this variant (kept for signature parity with
            get_frame_from_vcap_with_fps).
        fps: FPS of the input video; queried from ``vidcap`` if None.
        frame_count: Frame count of the input video; queried from ``vidcap`` if None.
        video_file_name: Used only in diagnostic messages.

    Returns:
        ``(images, n)`` where ``images`` is a list of PIL images and ``n`` is the
        number of valid frames. On unreadable/empty videos, returns ``num_frames``
        blank 720x720 images and ``n == 0``.

    Raises:
        ValueError: if no frame could be decoded from a non-empty video.
    """
    import cv2

    if fps is None or frame_count is None:
        # if one of fps or frame_count is None, still recompute both
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    if fps == 0 or frame_count == 0:
        print(f"Video file not found. return empty images. {video_file_name}")
        return [
            Image.new("RGB", (720, 720)),
        ] * num_frames, 0

    duration = frame_count / fps
    frame_interval = frame_count // num_frames
    if frame_interval == 0 and frame_count <= 1:
        print(f"frame_interval is equal to 0. return empty image. {video_file_name}")
        return [
            Image.new("RGB", (720, 720)),
        ] * num_frames, 0
    # print("duration:", duration, "frames:", frame_count, "intervals:", frame_interval)

    images = []
    count = 0
    success = True
    # Evenly spaced frame indices to keep.
    frame_indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)
    while success:
        if frame_count >= num_frames:
            # Video long enough: decode sequentially, keep frames at the
            # precomputed indices.
            success, frame = vidcap.read()
            if count in frame_indices:
                try:
                    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    im_pil = Image.fromarray(img)
                    images.append(im_pil)
                except BaseException:
                    # NOTE(review): this `continue` skips `count += 1`, so a
                    # frame that fails to decode desynchronizes count from the
                    # reader position — confirm this is the intended recovery.
                    continue
                if len(images) >= num_frames:
                    return images, num_frames
            count += 1
        else:
            # Left padding frames if the video is not long enough: keep every
            # decodable frame until the stream ends.
            success, frame = vidcap.read()
            if success:
                try:
                    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    im_pil = Image.fromarray(img)
                    images.append(im_pil)
                except BaseException:
                    continue
                count += 1
            else:
                break
    if len(images) == 0:
        raise ValueError("Did not find enough frames in the video. return empty image.")

    return images, len(images)
87
+
88
+
89
def get_frame_from_vcap_with_fps(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
    """
    num_frames is the max number of frames the model can support.
    frame_count is the number of frames in the input video.
    max_fps is the max FPS of the model can support.
    fps is the fps of the input video.

    Returns (frames, n): a list of PIL RGB frames and the count of real frames.
    Unreadable or degenerate videos yield a random-length list of blank
    720x720 images and n == 0.
    """

    import random

    import cv2

    if fps == None or frame_count == None:
        # if one of fps or frame_count is None, still recompute
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))

    if fps == 0 or frame_count == 0:
        print(f"Video file not found. return empty images. {video_file_name}")
        # Random blank-frame count — presumably to vary batch shapes; TODO confirm.
        empty_video_frames = int(random.uniform(2, 8 * max_fps))
        return [
            Image.new("RGB", (720, 720)),
        ] * empty_video_frames, 0

    duration = frame_count / fps
    # print("duration:", duration, "frames:", frame_count, "fps:", fps, "num_frames:", num_frames, "max_fps:", max_fps)
    # If the video is too long (longer than max_fps and num_frames can support),
    # we will use lower fps to sample frames.
    if duration >= num_frames / max_fps:
        frame_interval = frame_count // num_frames

        # If the video is too short, we will skip the video if there is only one frame.
        if frame_interval == 0 and frame_count <= 1:
            print(f"frame_interval is equal to 0. return empty image. {video_file_name}")
            empty_video_frames = int(random.uniform(2, 8 * max_fps))
            return [
                Image.new("RGB", (720, 720)),
            ] * empty_video_frames, 0

        images = []
        count = 0
        success = True
        # num_frames indices spread uniformly over the clip.
        frame_indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)

        while success:
            if frame_count >= num_frames:
                if count in frame_indices:
                    # Fully decode only the frames we intend to keep.
                    success, frame = vidcap.read()
                    try:
                        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        im_pil = Image.fromarray(img)
                        images.append(im_pil)
                    except:
                        # NOTE(review): bare except + continue skips `count += 1`,
                        # which can stall indexing if a selected frame fails to decode.
                        continue
                    if len(images) >= num_frames:
                        return images, num_frames
                else:
                    # grab() advances the stream without decoding — cheaper than read().
                    success = vidcap.grab()
                count += 1
            else:
                # Left padding frames if the video is not long enough
                success, frame = vidcap.read()
                if success:
                    try:
                        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        im_pil = Image.fromarray(img)
                        images.append(im_pil)
                    except:
                        continue
                    count += 1
                else:
                    break
    else:
        # Short video: sample at max_fps rather than stretching to num_frames.
        frames_required = int(duration * max_fps)
        frame_indices = np.linspace(0, frame_count - 1, frames_required, dtype=int)
        if frames_required == 0:
            print(f"frames_required is fewer than 2. Duration {duration}, return empty image.")
            empty_video_frames = int(random.uniform(2, 8 * max_fps))
            return [
                Image.new("RGB", (720, 720)),
            ] * empty_video_frames, 0
        elif frames_required == 1:
            # A single frame cannot form a clip; take the two endpoint frames.
            frame_indices = np.linspace(0, frame_count - 1, 2, dtype=int)
        images = []
        count = 0
        looked = 0
        success = True

        while success:
            success, frame = vidcap.read()
            if success and (looked in frame_indices):
                try:
                    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    im_pil = Image.fromarray(img)
                    images.append(im_pil)
                except:
                    continue
                count += 1
            looked += 1

    if len(images) == 0:
        empty_video_frames = int(random.uniform(2, 8 * max_fps))
        return [
            Image.new("RGB", (720, 720)),
        ] * empty_video_frames, 0
    else:
        return images, len(images)
199
+
200
+
201
def opencv_extract_frames(vpath_or_bytesio, frames=6, max_fps=0.0, fps=None, frame_count=None):
    """
    Extract frames from a video using OpenCV.

    Args:
        vpath_or_bytesio (str or BytesIO): Path to the video file or BytesIO object containing the video.
        frames (int): Number of frames to extract from the video.
        max_fps (float): When > 0.0, dispatch to the fps-aware sampler
            (get_frame_from_vcap_with_fps) instead of uniform sampling.
        fps (float): Frames per second of the video. If 0.0, the function will extract frames at equal intervals.
        frame_count (int): Cached total frame count; recomputed downstream when None.

    Returns:
        list: List of PIL Images extracted from the video.

    Raises:
        NotImplementedError: If the type of `vpath_or_bytesio` is not supported.
    """
    import cv2

    if isinstance(vpath_or_bytesio, str):
        vidcap = cv2.VideoCapture(vpath_or_bytesio)
        if max_fps > 0.0:
            return get_frame_from_vcap_with_fps(
                vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=vpath_or_bytesio
            )
        return get_frame_from_vcap(
            vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=vpath_or_bytesio
        )
    elif isinstance(vpath_or_bytesio, (BytesIO,)):
        # assuming mp4
        # Spill the in-memory bytes to a temp file because cv2.VideoCapture
        # only accepts filesystem paths; all reads happen inside the `with`.
        with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as temp_video:
            temp_video.write(vpath_or_bytesio.read())
            temp_video_name = temp_video.name
            vidcap = cv2.VideoCapture(temp_video_name)
            if max_fps > 0.0:
                return get_frame_from_vcap_with_fps(
                    vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=temp_video_name
                )
            return get_frame_from_vcap(
                vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=temp_video_name
            )
    else:
        raise NotImplementedError(type(vpath_or_bytesio))
242
+
243
+
244
def load_image_from_base64(image):
    """Decode a base64-encoded image payload into a PIL Image."""
    raw_bytes = base64.b64decode(image)
    return Image.open(BytesIO(raw_bytes))
246
+
247
+
248
def expand2square(pil_img, background_color):
    """
    Pad a PIL image out to a square canvas.

    Parameters:
    - pil_img: the PIL image to pad.
    - background_color: fill color for the padding (tuple; first channel
      is used for single-channel "L" images).

    Returns the original image unchanged if it is already square; otherwise
    a new square image with the original centered along its short axis.
    """
    width, height = pil_img.size
    # Grayscale images take a scalar fill value rather than a tuple.
    if pil_img.mode == "L":
        background_color = background_color[0]
    if width == height:
        return pil_img
    side = max(width, height)
    result = Image.new(pil_img.mode, (side, side), background_color)
    if width > height:
        # Wider than tall: pad top and bottom.
        result.paste(pil_img, (0, (side - height) // 2))
    else:
        # Taller than wide: pad left and right.
        result.paste(pil_img, ((side - width) // 2, 0))
    return result
276
+
277
+
278
def process_image(image_file, data_args, image_folder):
    """Load and preprocess one image according to data_args.image_aspect_ratio.

    Args:
        image_file: path string (optionally relative to image_folder) or an
            already-decoded PIL image.
        data_args: config object carrying `image_processor` and
            `image_aspect_ratio`.
        image_folder: optional root directory for relative paths; may be None.

    Returns:
        The processor's pixel_values tensor for the single image.
    """
    processor = data_args.image_processor
    if isinstance(image_file, str):
        path = image_file if image_folder is None else os.path.join(image_folder, image_file)
        image = Image.open(path).convert("RGB")
    else:
        # image is stored in bytearray / already a PIL image
        image = image_file.convert("RGB")

    if data_args.image_aspect_ratio == "resize":
        if hasattr(data_args.image_processor, "crop_size"):
            # CLIP vision tower
            crop_size = data_args.image_processor.crop_size
        else:
            # SIGLIP vision tower
            assert hasattr(data_args.image_processor, "size")
            crop_size = data_args.image_processor.size
        # NOTE(review): PIL resize() takes (width, height) but this passes
        # (height, width); harmless for square crops — confirm otherwise.
        image = image.resize((crop_size["height"], crop_size["width"]))

    if data_args.image_aspect_ratio == "pad":
        # Reuse the module-level expand2square helper instead of redefining it;
        # the image is RGB here, so behavior is identical to the old nested copy.
        image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean))
        image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
    else:
        # Using default behavior of the vision encoder:
        # CLIP/Radio default to central crop; Siglip/InternVIT default to resize.
        image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
    return image
323
+
324
+
325
def process_images(images, image_processor, model_cfg):
    """Preprocess a list of images into tensors via process_image().

    NOTE: mutates `model_cfg` in place by attaching `image_processor` so the
    per-image helper can read it off the config object.

    Returns a stacked tensor when all images share one shape, otherwise the
    list of per-image tensors unchanged.
    """
    model_cfg.image_processor = image_processor
    new_images = [process_image(image, model_cfg, None) for image in images]

    if all(x.shape == new_images[0].shape for x in new_images):
        new_images = torch.stack(new_images, dim=0)
    return new_images
333
+
334
+
335
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None, lstrip=False):
    """Tokenize a prompt containing "<image>" placeholders, splicing
    `image_token_index` in place of each placeholder.

    Keeps a single BOS token up front (unless `lstrip`) and drops the
    duplicate BOS each tokenizer call prepends to later chunks.
    Returns a list of ids, or a long tensor when return_tensors == "pt".
    """
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]

    def interleave(chunks, sep):
        # chunk0, sep, chunk1, sep, ..., chunkN — separator between each pair.
        woven = []
        for chunk in chunks:
            woven.append(chunk)
            woven.append(sep)
        return woven[:-1]

    input_ids = []
    offset = 0
    if lstrip:
        offset = 1
    elif len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    separator = [image_token_index] * (offset + 1)
    for chunk_id, chunk in enumerate(interleave(prompt_chunks, separator)):
        if chunk_id == 0 and lstrip:
            input_ids.extend(chunk)
        else:
            input_ids.extend(chunk[offset:])

    if return_tensors is not None:
        if return_tensors == "pt":
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f"Unsupported tensor type: {return_tensors}")
    return input_ids
361
+
362
+
363
def is_gemma_tokenizer(tokenizer):
    """Return True if the tokenizer's class name contains 'gemma' (case-insensitive)."""
    class_name = type(tokenizer).__name__
    return "gemma" in class_name.lower()
365
+
366
+
367
def get_model_name_from_path(model_path):
    """Derive a model name from a filesystem path.

    For checkpoint directories ("…/model/checkpoint-N") the parent name is
    prepended so the result is unique; otherwise the last path component.
    """
    parts = model_path.strip("/").split("/")
    if parts[-1].startswith("checkpoint-"):
        return f"{parts[-2]}_{parts[-1]}"
    return parts[-1]
374
+
375
+
376
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stop generation once any keyword string appears in the generated
    tokens. A batch stops only when every sequence has hit a keyword."""

    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            # Drop the BOS the tokenizer prepends so we match raw keyword ids.
            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
                cur_keyword_ids = cur_keyword_ids[1:]
            if len(cur_keyword_ids) > self.max_keyword_len:
                self.max_keyword_len = len(cur_keyword_ids)
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        # Length of the prompt; only tokens past this index are generated output.
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Check one sequence (shape [1, seq_len]) for any stop keyword."""
        # Only the last few generated tokens can contain a keyword.
        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
        # Fast path: exact token-id suffix match.
        for keyword_id in self.keyword_ids:
            if (output_ids[0, -keyword_id.shape[0] :] == keyword_id).all():
                return True
        # Slow path: decode the tail and search for the keyword as text.
        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        for keyword in self.keywords:
            if keyword in outputs:
                return True
        return False

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True (stop) only when every batch element matched a keyword."""
        outputs = []
        for i in range(output_ids.shape[0]):
            outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
        return all(outputs)
VILA/llava/modals.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ __all__ = ["Modal", "Image", "Video"]
4
+
5
+
6
class Modal:
    """Marker base class for input modalities."""

    pass


class File(Modal):
    """A modality backed by a file on disk; existence and extension are
    validated at construction time."""

    # Subclasses restrict allowed file extensions; None means "accept any".
    EXTENSIONS = None

    def __init__(self, path: str) -> None:
        self.path = path
        if not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {path}")
        if self.EXTENSIONS is not None and not any(path.endswith(ext) for ext in self.EXTENSIONS):
            raise ValueError(f"Unsupported file extension: {os.path.splitext(path)[1]}")


class Image(File):
    """An image input.

    NOTE(review): the extension list also accepts video formats
    (.mp4/.mov/.avi/.mkv/.webm) — presumably intentional so videos can be
    fed through the image path; confirm before tightening."""

    EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp", ".mp4", ".mov", ".avi", ".mkv", ".webm"]


class Video(File):
    """A video input restricted to .mp4 files."""

    EXTENSIONS = [".mp4"]
VILA/scripts/convert_gqa_for_eval.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import argparse
18
+ import json
19
+
20
# Convert a predictions .jsonl ({"question_id", "text"} per line) into the
# list-of-dicts JSON format expected by the GQA evaluation script.
parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

all_answers = []
# `with` closes the source file deterministically (the original leaked the
# handle); the unused enumerate index is dropped.
with open(args.src) as src_file:
    for line in src_file:
        res = json.loads(line)
        # GQA's grader lowercases answers and ignores a trailing period.
        text = res["text"].rstrip(".").lower()
        all_answers.append({"questionId": res["question_id"], "prediction": text})

with open(args.dst, "w") as f:
    json.dump(all_answers, f)
VILA/scripts/convert_karpathy_to_anno.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # import json
18
+
19
+ # # coco for reference
20
+ # # dict_keys(['images', 'dataset']) -> ["images"]
21
+ # karpathy = json.load(open("/home/jil/datasets/karpathy_json/dataset_coco.json"))
22
+ # # dict_keys(['info', 'images', 'licenses', 'annotations']) -> ['images', 'annotations']]
23
+ # anno = json.load(open("/tmp/coco/annotations/captions_val2014.json"))
24
+ # # assert len(karpathy["images"]) == len(anno["images"]) == len(anno["annotations"]), (
25
+ # # len(karpathy["images"]), len(anno["images"]), len(anno["annotations"]) # (123287, 40504, 202654)
26
+ # # )
27
+
28
+
29
+ # karpathy_flickr = json.load(open("/home/jil/datasets/karpathy_json/dataset_coco.json"))
30
+ # anno_flickr = {
31
+ # "images": [],
32
+ # "annotations": [],
33
+ # }
34
+
35
+ # print(karpathy["images"][0])
36
+ # print(anno["images"][0])
37
+ # print(anno["annotations"][:3])
38
+
39
+ # image_id_set = set([_["id"] for _ in anno["images"]])
40
+ # anno_set = set([_["id"] for _ in anno["annotations"]])
41
+
42
+ # print(len(anno_set))
43
+
44
+
45
+ import argparse
46
+ import json
47
+
48
+ from tqdm import tqdm
49
+
50
+
51
def main(input_json, output_json, split):
    """Convert a Karpathy-format caption dataset JSON into COCO captions format.

    Args:
        input_json: path to the Karpathy split file (keys: "images", "dataset").
        output_json: destination path for the COCO-style annotation file.
        split: which Karpathy split to keep ("train"/"val"/"test"), or "all".
    """
    # Skeleton of a COCO captions annotation file; "images" and "annotations"
    # are filled in below. The info/licenses boilerplate mirrors COCO 2014.
    annot_format = {
        "info": {
            "year": 2014,
            "version": "1.0",
            "description": "This is stable 1.0 version of the 2014 MS COCO dataset.",
            "contributor": "Microsoft COCO group",
            "url": "http://mscoco.org",
            "date_created": "2015-01-27 09:11:52.357475",
        },
        "licenses": [
            {
                "url": "http://creativecommons.org/licenses/by-nc-sa/2.0/",
                "id": 1,
                "name": "Attribution-NonCommercial-ShareAlike License",
            },
            {
                "url": "http://creativecommons.org/licenses/by-nc/2.0/",
                "id": 2,
                "name": "Attribution-NonCommercial License",
            },
            {
                "url": "http://creativecommons.org/licenses/by-nc-nd/2.0/",
                "id": 3,
                "name": "Attribution-NonCommercial-NoDerivs License",
            },
            {"url": "http://creativecommons.org/licenses/by/2.0/", "id": 4, "name": "Attribution License"},
            {
                "url": "http://creativecommons.org/licenses/by-sa/2.0/",
                "id": 5,
                "name": "Attribution-ShareAlike License",
            },
            {"url": "http://creativecommons.org/licenses/by-nd/2.0/", "id": 6, "name": "Attribution-NoDerivs License"},
            {"url": "http://flickr.com/commons/usage/", "id": 7, "name": "No known copyright restrictions"},
            {"url": "http://www.usa.gov/copyright.shtml", "id": 8, "name": "United States Government Work"},
        ],
        "type": "captions",
        "images": [],
        "annotations": [],
    }

    with open(input_json) as f:
        dataset = json.load(f)
    annotations = dataset["images"]
    dataset_name = dataset["dataset"]

    count = 0
    print(f"Converting Karpathy {dataset_name} {split} to COCO Format...")
    for annot in tqdm(annotations):
        if split == "all" or (annot["split"] == split):
            # Use the filename stem as the image id (not Karpathy's 'imgid').
            image_id = str(annot["filename"].split(".")[0])  # annot['imgid']
            annot_format["images"].append(
                {
                    "id": image_id,
                    # Width/height are placeholders; presumably unused by the
                    # caption scorer — confirm before relying on them.
                    "width": 512,
                    "height": 512,
                    "filename": annot["filename"],
                    "license": 1,
                    "flickr_url": "",
                    "coco_url": "",
                    "date_captured": "",
                }
            )

            # One COCO annotation per reference sentence.
            for sent in annot["sentences"]:
                annot_format["annotations"].append({"id": sent["sentid"], "image_id": image_id, "caption": sent["raw"]})
            count += 1

    with open(output_json, "w") as f:
        json.dump(annot_format, f)
+
122
+
123
if __name__ == "__main__":
    # CLI wrapper around main(); defaults target the Flickr30k Karpathy split.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--input-json", type=str, default="/home/jil/datasets/karpathy_json/dataset_flickr30k.json")
    arg_parser.add_argument("--output-json", type=str, default="/home/jil/datasets/flickr30k/flickr30k_coco_all.json")
    arg_parser.add_argument("--split", type=str, default="all")
    cli_args = arg_parser.parse_args()

    main(cli_args.input_json, cli_args.output_json, cli_args.split)
VILA/scripts/convert_mmbench_for_submission.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import argparse
18
+ import json
19
+ import os
20
+
21
+ import pandas as pd
22
+
23
+
24
def get_args():
    """Parse CLI arguments for the MMBench submission converter."""
    parser = argparse.ArgumentParser()
    for flag in ("--annotation-file", "--result-dir", "--upload-dir", "--experiment"):
        parser.add_argument(flag, type=str, required=True)
    return parser.parse_args()
32
+
33
+
34
if __name__ == "__main__":
    args = get_args()

    # MMBench annotations ship as a TSV.
    df = pd.read_table(args.annotation_file)

    cur_df = df.copy()
    # Keep only the columns the upload format expects.
    cur_df = cur_df.drop(columns=["hint", "category", "source", "image", "comment", "l2-category"])
    cur_df.insert(6, "prediction", None)
    # Fill predictions by matching each jsonl record's question_id against the
    # TSV's "index" column.
    for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")):
        pred = json.loads(pred)
        cur_df.loc[df["index"] == pred["question_id"], "prediction"] = pred["text"]

    # MMBench's evaluation server expects an .xlsx upload.
    cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}_upload.xlsx"), index=False, engine="openpyxl")
VILA/scripts/convert_mmvet_for_eval.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import argparse
18
+ import json
19
+
20
# Convert a predictions .jsonl into the {"v1_<question_id>": text} mapping
# expected by the MM-Vet grader.
parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

cur_result = {}

for line in open(args.src):
    data = json.loads(line)
    qid = data["question_id"]
    # MM-Vet keys are prefixed with the dataset version ("v1_").
    cur_result[f"v1_{qid}"] = data["text"]

with open(args.dst, "w") as f:
    json.dump(cur_result, f, indent=2)
VILA/scripts/convert_seed_for_submission.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import argparse
18
+ import json
19
+
20
+
21
def get_args():
    """Parse CLI arguments for the SEED-Bench submission converter."""
    parser = argparse.ArgumentParser()
    for flag in ("--annotation-file", "--result-file", "--result-upload-file"):
        parser.add_argument(flag, type=str)
    return parser.parse_args()
27
+
28
+
29
def eval_single(result_file, eval_only_type=None):
    """Score one SEED-Bench result file and print per-type / total accuracy.

    NOTE(review): relies on module-level globals `data` (annotation JSON) and
    `ques_type_id_to_name`, both defined in the __main__ block below — this
    function only works when called from there.

    Args:
        result_file: path to a .jsonl of {"question_id", "text"} predictions.
        eval_only_type: restrict scoring to one data_type (e.g. "image");
            None scores everything and prints per-type breakdowns.

    Returns:
        dict mapping question_id -> prediction row, reused by the caller.
    """
    results = {}
    for line in open(result_file):
        row = json.loads(line)
        results[row["question_id"]] = row

    type_counts = {}
    correct_counts = {}
    for question_data in data["questions"]:
        if eval_only_type is not None and question_data["data_type"] != eval_only_type:
            continue
        data_type = question_data["question_type_id"]
        type_counts[data_type] = type_counts.get(data_type, 0) + 1
        # Result files may store question ids as ints or strings; try int first.
        try:
            question_id = int(question_data["question_id"])
        except BaseException:
            question_id = question_data["question_id"]
        if question_id not in results:
            # Missing prediction counts as incorrect (seeds the type's entry).
            correct_counts[data_type] = correct_counts.get(data_type, 0)
            continue
        row = results[question_id]
        if row["text"] == question_data["answer"]:
            correct_counts[data_type] = correct_counts.get(data_type, 0) + 1

    total_count = 0
    total_correct = 0
    for data_type in sorted(type_counts.keys()):
        # NOTE(review): KeyError is possible here if every prediction for a
        # type was present but wrong — correct_counts is never seeded then.
        accuracy = correct_counts[data_type] / type_counts[data_type] * 100
        if eval_only_type is None:
            print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%")

        total_count += type_counts[data_type]
        total_correct += correct_counts[data_type]

    total_accuracy = total_correct / total_count * 100
    if eval_only_type is None:
        print(f"Total accuracy: {total_accuracy:.2f}%")
    else:
        print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%")

    return results
70
+
71
+
72
if __name__ == "__main__":
    args = get_args()
    data = json.load(open(args.annotation_file))
    # Invert {name: type_id} so per-type accuracies can be printed by name.
    ques_type_id_to_name = {id: n for n, id in data["question_type"].items()}

    # Print overall, image-only, and video-only accuracy.
    results = eval_single(args.result_file)
    eval_single(args.result_file, eval_only_type="image")
    eval_single(args.result_file, eval_only_type="video")

    # Write the leaderboard submission file, one JSON record per question.
    with open(args.result_upload_file, "w") as fp:
        for question in data["questions"]:
            qid = question["question_id"]
            if qid in results:
                result = results[qid]
            else:
                # Fall back to int keys when the result file stored ids as ints.
                result = results[int(qid)]
            fp.write(json.dumps({"question_id": qid, "prediction": result["text"]}) + "\n")
VILA/scripts/convert_sqa_to_llava.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import json
18
+ import os
19
+
20
+ import fire
21
+ from convert_sqa_to_llava_base_prompt import build_prompt_chatbot
22
+
23
+
24
def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"):
    """Convert ScienceQA problems for one split into LLaVA conversation JSON.

    Args:
        base_dir: directory containing pid_splits.json and problems.json;
            the output llava_<split>_<prompt_format>.json is written here too.
        split: which problem split to convert (e.g. "train", "val", "test").
        prompt_format: prompt layout passed through to build_prompt_chatbot.
    """
    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split]
    problems = json.load(open(os.path.join(base_dir, "problems.json")))

    split_problems = build_prompt_chatbot(problems, split_indices, prompt_format, use_caption=False, is_test=False)

    target_format = []
    # Renamed from `input`/`output`, which shadowed builtins.
    for prob_id, (question, answer) in split_problems.items():
        # Strip the prompt builder's leading labels; the chat template
        # supplies its own role markers.
        if question.startswith("Question: "):
            question = question.replace("Question: ", "")
        if answer.startswith("Answer: "):
            answer = answer.replace("Answer: ", "")

        raw_prob_data = problems[prob_id]
        record = {"id": prob_id}
        if raw_prob_data["image"] is None:
            human_value = f"{question}"
        else:
            record["image"] = os.path.join(prob_id, raw_prob_data["image"])
            # Image placeholder goes after the question text.
            human_value = f"{question}\n<image>"
        record["conversations"] = [
            {"from": "human", "value": human_value},
            {"from": "gpt", "value": f"{answer}"},
        ]
        target_format.append(record)

    print(f"Number of samples: {len(target_format)}")

    with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f:
        json.dump(target_format, f, indent=2)
65
+
66
+
67
def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"):
    """Convert ScienceQA problems for one split into instruction/output JSONL.

    Args:
        base_dir: directory containing pid_splits.json and problems.json;
            the output scienceqa_<split>_<prompt_format>.jsonl is written here.
        split: which problem split to convert.
        prompt_format: prompt layout passed through to build_prompt_chatbot.
    """
    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split]
    problems = json.load(open(os.path.join(base_dir, "problems.json")))

    split_problems = build_prompt_chatbot(problems, split_indices, prompt_format, use_caption=False, is_test=False)

    out_path = os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl")
    # `with` replaces the manual open()/close() pair so the file is closed
    # even if a record fails to serialize. Locals renamed from `input`/`output`,
    # which shadowed builtins.
    with open(out_path, "w") as writer:
        for prob_id, (question, answer) in split_problems.items():
            # Strip the prompt builder's leading labels.
            if question.startswith("Question: "):
                question = question.replace("Question: ", "")
            if answer.startswith("Answer: "):
                answer = answer.replace("Answer: ", "")

            raw_prob_data = problems[prob_id]
            data = {"id": prob_id}
            if raw_prob_data["image"] is not None:
                data["image"] = os.path.join(prob_id, raw_prob_data["image"])
                data["instruction"] = f"{question}\n<image>"
            else:
                data["instruction"] = f"{question}"
            data["output"] = f"{answer}"
            writer.write(json.dumps(data) + "\n")
97
+
98
+
99
def main(task, **kwargs):
    """Dispatch to the module-level function named by `task`, forwarding kwargs."""
    task_fn = globals()[task]
    task_fn(**kwargs)
101
+
102
+
103
if __name__ == "__main__":
    # Expose `main` via Fire: python convert_sqa_to_llava.py <task> --arg=value
    fire.Fire(main)
VILA/scripts/convert_sqa_to_llava_base_prompt.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+
18
def get_question_text(problem):
    """Return the raw question string of a ScienceQA problem record."""
    return problem["question"]
21
+
22
+
23
def get_context_text(problem, use_caption):
    """Join the text hint with the image caption (only when ``use_caption``)
    into a single context string; returns "N/A" when both pieces are empty."""
    pieces = [problem["hint"], problem["caption"] if use_caption else ""]
    context = " ".join(pieces).strip()
    return context if context else "N/A"
30
+
31
+
32
def get_choice_text(problem, options):
    """Format a problem's answer choices as "(A) foo (B) bar ..." using the
    letters supplied in ``options``.

    NOTE: the parameter was misspelled ``probelm``; renamed to ``problem``.
    All call sites in this file pass it positionally, so the rename is safe.
    """
    return " ".join(f"({options[i]}) {c}" for i, c in enumerate(problem["choices"]))
40
+
41
+
42
def get_answer(problem, options):
    """Map the problem's integer answer index to its option letter."""
    answer_index = problem["answer"]
    return options[answer_index]
44
+
45
+
46
def get_lecture_text(problem):
    """Return the lecture text with newlines escaped as literal "\\n".

    (Escaping lets GPT-3 generate the lecture with more tokens.)
    """
    return problem["lecture"].replace("\n", "\\n")
50
+
51
+
52
def get_solution_text(problem):
    """Return the solution text with newlines escaped as literal "\\n".

    (Escaping lets GPT-3 generate the solution with more tokens.)
    """
    return problem["solution"].replace("\n", "\\n")
56
+
57
+
58
def create_one_example_chatbot(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Build one (input, output) chatbot training pair.

    Parameters
    ----------
    format : str
        "<input_format>-<output_format>", e.g. "QCM-LEPA". The input letters
        select which fields appear and in what order (Q=question, C=context,
        M=options, L=lecture, E=explanation/solution); the output letters
        select the answer template.
    test_example : bool
        When True the output is just the "Answer:" stub.

    Returns
    -------
    (str, str)
        Prompt input and target output, with double spaces collapsed and a
        trailing "BECAUSE:" stub (empty lecture/solution) stripped.

    Raises
    ------
    ValueError
        For an unrecognized input/output format (the original fell through to
        a NameError on an unbound local).
    """
    input_format, output_format = format.split("-")

    ## Inputs
    if input_format == "CQM":
        input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n"
    elif input_format == "QCM":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n"
    # upper bound experiment
    elif input_format == "QCML":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n"
    elif input_format == "QCME":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n"
    elif input_format == "QCMLE":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n"
    elif input_format == "QCLM":
        input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n"
    elif input_format == "QCEM":
        input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n"
    elif input_format == "QCLEM":
        input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n"
    else:
        raise ValueError(f"Unknown input format: {input_format}")

    # Outputs
    if test_example:
        output = "Answer:"
    elif output_format == "A":
        output = f"Answer: The answer is {answer}."
    # NOTE(review): AL uses `solution` and AE uses `lecture` — preserved as-is
    # (this quirk matches the upstream ScienceQA prompt builder).
    elif output_format == "AL":
        output = f"Answer: The answer is {answer}. BECAUSE: {solution}"
    elif output_format == "AE":
        output = f"Answer: The answer is {answer}. BECAUSE: {lecture}"
    elif output_format == "ALE":
        output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}"
    elif output_format == "AEL":
        output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}"
    elif output_format == "LA":
        output = f"Answer: {lecture} The answer is {answer}."
    elif output_format == "EA":
        output = f"Answer: {solution} The answer is {answer}."
    elif output_format == "LEA":
        output = f"Answer: {lecture} {solution} The answer is {answer}."
    elif output_format == "ELA":
        output = f"Answer: {solution} {lecture} The answer is {answer}."
    elif output_format == "LEPA":
        output = ""
        if len(lecture.strip()) > 0:
            output += f"LECTURE: {lecture}\n"
        if len(solution.strip()) > 0:
            output += f"SOLUTION: {solution}\n"
        output += "###\n"
        output += f"ANSWER: {answer}."
    else:
        raise ValueError(f"Unknown output format: {output_format}")

    # BUG FIX: the original called .replace(" ", " ") — a no-op. The intent
    # (matching the upstream ScienceQA/LLaVA code) is to collapse double spaces.
    input = input.replace("  ", " ").strip()
    output = output.replace("  ", " ").strip()
    if input.endswith("BECAUSE:"):
        input = input.replace("BECAUSE:", "").strip()
    if output.endswith("BECAUSE:"):
        output = output.replace("BECAUSE:", "").strip()
    return input, output
121
+
122
+
123
def create_one_example(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Build one flat-text few-shot example: rendered input immediately
    followed by the rendered output.

    ``format`` is "<input_format>-<output_format>" (see
    ``create_one_example_chatbot`` for the field letters). When
    ``test_example`` is True the output is the "Answer:" stub so the model
    completes it. Double spaces are collapsed and a trailing "BECAUSE:" stub
    is stripped.

    Raises ValueError for an unrecognized format (the original fell through
    to a NameError on an unbound local).
    """
    input_format, output_format = format.split("-")

    ## Inputs
    if input_format == "CQM":
        input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n"
    elif input_format == "QCM":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n"
    # upper bound experiment
    elif input_format == "QCML":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n"
    elif input_format == "QCME":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n"
    elif input_format == "QCMLE":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n"
    elif input_format == "QCLM":
        input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n"
    elif input_format == "QCEM":
        input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n"
    elif input_format == "QCLEM":
        input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n"
    else:
        raise ValueError(f"Unknown input format: {input_format}")

    # Outputs
    if test_example:
        output = "Answer:"
    elif output_format == "A":
        output = f"Answer: The answer is {answer}."
    # NOTE(review): AL uses `solution` and AE uses `lecture` — preserved as-is
    # (matches the upstream ScienceQA prompt builder).
    elif output_format == "AL":
        output = f"Answer: The answer is {answer}. BECAUSE: {solution}"
    elif output_format == "AE":
        output = f"Answer: The answer is {answer}. BECAUSE: {lecture}"
    elif output_format == "ALE":
        output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}"
    elif output_format == "AEL":
        output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}"
    elif output_format == "LA":
        output = f"Answer: {lecture} The answer is {answer}."
    elif output_format == "EA":
        output = f"Answer: {solution} The answer is {answer}."
    elif output_format == "LEA":
        output = f"Answer: {lecture} {solution} The answer is {answer}."
    elif output_format == "ELA":
        output = f"Answer: {solution} {lecture} The answer is {answer}."
    else:
        raise ValueError(f"Unknown output format: {output_format}")

    text = input + output
    # BUG FIX: the original called .replace(" ", " ") — a no-op. The intent
    # is to collapse double spaces (as in the upstream ScienceQA/LLaVA code).
    text = text.replace("  ", " ").strip()
    if text.endswith("BECAUSE:"):
        text = text.replace("BECAUSE:", "").strip()
    return text
176
+
177
+
178
def create_one_example_gpt4(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Build one GPT-4 chat-API example as a (user, assistant) message pair.

    ``format`` is "<input_format>-<output_format>" (see
    ``create_one_example_chatbot`` for the field letters). The rendered input
    is wrapped in a "Can you explain ...?" user message; the rendered output
    becomes the assistant message. When ``test_example`` is True the assistant
    content is just "Answer:".

    Raises ValueError for an unrecognized format (the original fell through
    to a NameError on an unbound local).
    """
    input_format, output_format = format.split("-")

    ## Inputs
    if input_format == "CQM":
        input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n"
    elif input_format == "QCM":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n"
    # upper bound experiment
    elif input_format == "QCML":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n"
    elif input_format == "QCME":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n"
    elif input_format == "QCMLE":
        input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n"
    elif input_format == "QCLM":
        input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n"
    elif input_format == "QCEM":
        input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n"
    elif input_format == "QCLEM":
        input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n"
    else:
        raise ValueError(f"Unknown input format: {input_format}")

    # Outputs
    if test_example:
        output = "Answer:"
    elif output_format == "A":
        output = f"Answer: The answer is {answer}."
    # NOTE(review): AL uses `solution` and AE uses `lecture` — preserved as-is
    # (matches the upstream ScienceQA prompt builder).
    elif output_format == "AL":
        output = f"Answer: The answer is {answer}. BECAUSE: {solution}"
    elif output_format == "AE":
        output = f"Answer: The answer is {answer}. BECAUSE: {lecture}"
    elif output_format == "ALE":
        output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}"
    elif output_format == "AEL":
        output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}"
    elif output_format == "LA":
        output = f"Answer: {lecture} The answer is {answer}."
    elif output_format == "EA":
        output = f"Answer: {solution} The answer is {answer}."
    elif output_format == "LEA":
        output = f"Answer: {lecture} {solution} The answer is {answer}."
    elif output_format == "ELA":
        output = f"Answer: {solution} {lecture} The answer is {answer}."
    else:
        raise ValueError(f"Unknown output format: {output_format}")

    # BUG FIX: the original called .replace(" ", " ") — a no-op. The intent
    # is to collapse double spaces (as in the upstream ScienceQA/LLaVA code).
    input = input.replace("  ", " ").strip()
    output = output.replace("  ", " ").strip()
    if output.endswith("BECAUSE:"):
        output = output.replace("BECAUSE:", "").strip()

    user_prompt = {"role": "user", "content": f"Can you explain {input}?"}
    assistant_prompt = {"role": "assistant", "content": f"{output}"}

    return user_prompt, assistant_prompt
235
+
236
+
237
def build_prompt_chatbot(
    problems, shot_qids, prompt_format, use_caption=False, options=["A", "B", "C", "D", "E"], is_test=False
):
    """Render each problem id in ``shot_qids`` into a chatbot (input, output)
    pair, keyed by problem id.

    Lecture/solution text has its escaped "\\n" sequences restored to real
    newlines before rendering.
    """
    examples = {}
    for qid in shot_qids:
        prob = problems[qid]
        examples[qid] = create_one_example_chatbot(
            prompt_format,
            get_question_text(prob),
            get_context_text(prob, use_caption),
            get_choice_text(prob, options),
            get_answer(prob, options),
            get_lecture_text(prob).replace("\\n", "\n"),
            get_solution_text(prob).replace("\\n", "\n"),
            test_example=is_test,
        )
    return examples
255
+
256
+
257
def build_prompt(problems, shot_qids, test_qid, args):
    """Assemble an n-shot flat-text prompt: each shot example rendered with
    its answer, then the test example rendered with the "Answer:" stub,
    joined by blank lines."""

    def _fields(qid):
        # Extract the rendering inputs for one problem id.
        prob = problems[qid]
        return (
            get_question_text(prob),
            get_context_text(prob, args.use_caption),
            get_choice_text(prob, args.options),
            get_answer(prob, args.options),
            get_lecture_text(prob),
            get_solution_text(prob),
        )

    # n-shot training examples
    examples = [
        create_one_example(args.prompt_format, *_fields(qid), test_example=False)
        for qid in shot_qids
    ]
    # test example
    examples.append(create_one_example(args.prompt_format, *_fields(test_qid), test_example=True))

    # create the prompt input
    return "\n\n".join(examples)
292
+
293
+
294
def build_prompt_gpt4(problems, shot_qids, test_qid, args):
    """Assemble a GPT-4 chat message list: a system message, one
    (user, assistant) pair per shot example, then the test example's pair
    (assistant content is the "Answer:" stub)."""

    def _pair(qid, is_test):
        # Render one problem id into its (user, assistant) message pair.
        prob = problems[qid]
        return create_one_example_gpt4(
            args.prompt_format,
            get_question_text(prob),
            get_context_text(prob, args.use_caption),
            get_choice_text(prob, args.options),
            get_answer(prob, args.options),
            get_lecture_text(prob),
            get_solution_text(prob),
            test_example=is_test,
        )

    prompt_array = [{"role": "system", "content": "You are a helpful assistant."}]
    # n-shot training examples
    for qid in shot_qids:
        prompt_array.extend(_pair(qid, False))
    # test example
    prompt_array.extend(_pair(test_qid, True))
    return prompt_array
VILA/scripts/convert_vizwiz_for_submission.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import argparse
18
+ import json
19
+ import os
20
+
21
+ from llava.eval.m4c_evaluator import EvalAIAnswerProcessor
22
+
23
+
24
def parse_args():
    """CLI arguments: annotation file, raw result file, and the destination
    for the VizWiz upload file. All three are required."""
    parser = argparse.ArgumentParser()
    for flag in ("--annotation-file", "--result-file", "--result-upload-file"):
        parser.add_argument(flag, type=str, required=True)
    return parser.parse_args()
30
+
31
+
32
if __name__ == "__main__":

    args = parse_args()

    os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True)

    # Read model predictions; tolerate (and count) malformed JSONL lines.
    results = []
    error_line = 0
    for line_idx, line in enumerate(open(args.result_file)):
        try:
            results.append(json.loads(line))
        except BaseException:
            error_line += 1
    results = {entry["question_id"]: entry["text"] for entry in results}

    test_split = [json.loads(line) for line in open(args.annotation_file)]
    split_ids = {entry["question_id"] for entry in test_split}

    print(f"total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}")

    answer_processor = EvalAIAnswerProcessor()

    # Every annotation must have a prediction; normalize answers for EvalAI.
    all_answers = []
    for entry in test_split:
        assert entry["question_id"] in results
        all_answers.append({"image": entry["image"], "answer": answer_processor(results[entry["question_id"]])})

    with open(args.result_upload_file, "w") as f:
        json.dump(all_answers, f)
VILA/scripts/convert_vqav2_for_submission.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import argparse
18
+ import json
19
+ import os
20
+
21
+ from llava.eval.m4c_evaluator import EvalAIAnswerProcessor
22
+
23
+
24
def parse_args():
    """CLI arguments: the VQAv2 eval directory (defaults to the playground
    path) and the required split name."""
    parser = argparse.ArgumentParser()
    arg_specs = {
        "--dir": {"type": str, "default": "./playground/data/eval/vqav2"},
        "--split": {"type": str, "required": True},
    }
    for flag, spec in arg_specs.items():
        parser.add_argument(flag, **spec)
    return parser.parse_args()
29
+
30
+
31
if __name__ == "__main__":

    args = parse_args()

    src = os.path.join(args.dir, args.split, "answers", "merge.jsonl")
    test_split = os.path.join(args.dir, "llava_vqav2_mscoco_test2015.jsonl")
    dst = os.path.join(args.dir, args.split, f"{args.split}_answers_upload.json")
    os.makedirs(os.path.dirname(dst), exist_ok=True)

    # Read predictions; count (rather than crash on) malformed JSONL lines.
    results = []
    error_line = 0
    for line_idx, line in enumerate(open(src)):
        try:
            results.append(json.loads(line))
        except Exception:  # was a bare `except:`, which also swallowed KeyboardInterrupt
            error_line += 1

    results = {x["question_id"]: x["text"] for x in results}
    test_split = [json.loads(line) for line in open(test_split)]
    split_ids = {x["question_id"] for x in test_split}

    print(f"total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}")

    all_answers = []

    answer_processor = EvalAIAnswerProcessor()

    # Every test question gets an entry; missing predictions map to "".
    for x in test_split:
        if x["question_id"] not in results:
            all_answers.append({"question_id": x["question_id"], "answer": ""})
        else:
            all_answers.append({"question_id": x["question_id"], "answer": answer_processor(results[x["question_id"]])})

    # BUG FIX: the original wrote `json.dump(all_answers, open(dst, "w"))`
    # inside the `with` block, opening dst a second time, leaving that handle
    # unflushed/unclosed, and never using the managed handle `f`.
    with open(dst, "w") as f:
        json.dump(all_answers, f)
VILA/scripts/extract_mm_projector.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import argparse
18
+ import json
19
+ import os
20
+ from collections import defaultdict
21
+
22
+ import torch
23
+
24
+
25
def parse_args():
    """CLI arguments: the source model folder and the output weights file."""
    parser = argparse.ArgumentParser(description="Extract MMProjector weights")
    parser.add_argument("--model_name_or_path", type=str, help="model folder")
    parser.add_argument("--output", type=str, help="output file")
    return parser.parse_args()
31
+
32
+
33
if __name__ == "__main__":
    args = parse_args()

    # Parameter-name fragments whose tensors should be extracted.
    keys_to_match = ["mm_projector", "embed_tokens", "transformer.wte"]

    # Map each checkpoint shard filename to the matching weight names it holds.
    ckpt_to_key = defaultdict(list)
    try:
        index_path = os.path.join(args.model_name_or_path, "pytorch_model.bin.index.json")
        model_indices = json.load(open(index_path))
        for name, shard in model_indices["weight_map"].items():
            if any(fragment in name for fragment in keys_to_match):
                ckpt_to_key[shard].append(name)
    except FileNotFoundError:
        # Smaller models or model checkpoints saved by DeepSpeed.
        shard = "pytorch_model.bin"
        state_dict = torch.load(os.path.join(args.model_name_or_path, shard), map_location="cpu")
        for name in state_dict.keys():
            if any(fragment in name for fragment in keys_to_match):
                ckpt_to_key[shard].append(name)

    # Pull only the selected tensors out of each shard and save them together.
    loaded_weights = {}
    for ckpt_name, weight_keys in ckpt_to_key.items():
        ckpt = torch.load(os.path.join(args.model_name_or_path, ckpt_name), map_location="cpu")
        for name in weight_keys:
            loaded_weights[name] = ckpt[name]

    torch.save(loaded_weights, args.output)
VILA/scripts/zero2.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "train_micro_batch_size_per_gpu": "auto",
14
+ "train_batch_size": "auto",
15
+ "gradient_accumulation_steps": "auto",
16
+ "zero_optimization": {
17
+ "stage": 2,
18
+ "overlap_comm": true,
19
+ "contiguous_gradients": true,
20
+ "sub_group_size": 1e9,
21
+ "reduce_bucket_size": "auto"
22
+ }
23
+ }
VILA/scripts/zero3.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "train_micro_batch_size_per_gpu": "auto",
14
+ "train_batch_size": "auto",
15
+ "gradient_accumulation_steps": "auto",
16
+ "zero_optimization": {
17
+ "stage": 3,
18
+ "overlap_comm": true,
19
+ "contiguous_gradients": true,
20
+ "sub_group_size": 1e9,
21
+ "reduce_bucket_size": "auto",
22
+ "stage3_prefetch_bucket_size": "auto",
23
+ "stage3_param_persistence_threshold": "auto",
24
+ "stage3_max_live_parameters": 1e9,
25
+ "stage3_max_reuse_distance": 1e9,
26
+ "stage3_gather_16bit_weights_on_model_save": true
27
+ }
28
+ }
VILA/scripts/zero3_mics_mini_fixed.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "train_micro_batch_size_per_gpu": "auto",
14
+ "train_batch_size": "auto",
15
+ "gradient_accumulation_steps": "auto",
16
+ "zero_optimization": {
17
+ "stage": 3,
18
+ "overlap_comm": true,
19
+ "contiguous_gradients": true,
20
+ "sub_group_size": 1e9,
21
+ "reduce_bucket_size": 4e8,
22
+ "stage3_prefetch_bucket_size": 4e8,
23
+ "stage3_param_persistence_threshold": 1e4,
24
+ "stage3_max_live_parameters": 1e9,
25
+ "stage3_max_reuse_distance": 1e9,
26
+ "stage3_gather_16bit_weights_on_model_save": true,
27
+ "mics_shard_size": 64,
28
+ "mics_hierarchical_params_gather": false
29
+ }
30
+ }
VILA/scripts/zero3_mics_tiny_fixed.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "train_micro_batch_size_per_gpu": "auto",
14
+ "train_batch_size": "auto",
15
+ "gradient_accumulation_steps": "auto",
16
+ "zero_optimization": {
17
+ "stage": 3,
18
+ "overlap_comm": true,
19
+ "contiguous_gradients": true,
20
+ "sub_group_size": 1e9,
21
+ "reduce_bucket_size": 4e8,
22
+ "stage3_prefetch_bucket_size": 4e8,
23
+ "stage3_param_persistence_threshold": 1e4,
24
+ "stage3_max_live_parameters": 1e9,
25
+ "stage3_max_reuse_distance": 1e9,
26
+ "stage3_gather_16bit_weights_on_model_save": true,
27
+ "mics_shard_size": 16,
28
+ "mics_hierarchical_params_gather": false
29
+ }
30
+ }
VILA/scripts/zero3_offload.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "optimizer": {
14
+ "type": "AdamW",
15
+ "params": {
16
+ "lr": "auto",
17
+ "betas": "auto",
18
+ "eps": "auto",
19
+ "weight_decay": "auto"
20
+ }
21
+ },
22
+ "scheduler": {
23
+ "type": "WarmupLR",
24
+ "params": {
25
+ "warmup_min_lr": "auto",
26
+ "warmup_max_lr": "auto",
27
+ "warmup_num_steps": "auto"
28
+ }
29
+ },
30
+ "zero_optimization": {
31
+ "stage": 3,
32
+ "offload_optimizer": {
33
+ "device": "cpu",
34
+ "pin_memory": true
35
+ },
36
+ "offload_param": {
37
+ "device": "cpu",
38
+ "pin_memory": true
39
+ },
40
+ "overlap_comm": true,
41
+ "contiguous_gradients": true,
42
+ "sub_group_size": 1e9,
43
+ "reduce_bucket_size": "auto",
44
+ "stage3_prefetch_bucket_size": "auto",
45
+ "stage3_param_persistence_threshold": "auto",
46
+ "stage3_max_live_parameters": 1e9,
47
+ "stage3_max_reuse_distance": 1e9,
48
+ "stage3_gather_16bit_weights_on_model_save": true
49
+ },
50
+ "gradient_accumulation_steps": "auto",
51
+ "gradient_clipping": "auto",
52
+ "train_batch_size": "auto",
53
+ "train_micro_batch_size_per_gpu": "auto",
54
+ "steps_per_print": 1e5,
55
+ "wall_clock_breakdown": false
56
+ }
VILA/scripts/zero3_offload_inference.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bf16": {
3
+ "enabled": "auto"
4
+ },
5
+ "fp16": {
6
+ "enabled": "auto"
7
+ },
8
+ "zero_optimization": {
9
+ "stage": 3,
10
+ "stage3_prefetch_bucket_size": 33554432,
11
+ "stage3_param_persistence_threshold": 4096,
12
+ "stage3_max_live_parameters":33554432,
13
+ "offload_param": {
14
+ "device": "cpu",
15
+ "pin_memory": true
16
+ }
17
+ },
18
+ "train_batch_size": 8,
19
+ "train_micro_batch_size_per_gpu": 1,
20
+ "wall_clock_breakdown": false
21
+ }
VILA/scripts/zero3pp.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "train_micro_batch_size_per_gpu": "auto",
14
+ "train_batch_size": "auto",
15
+ "gradient_accumulation_steps": "auto",
16
+ "zero_optimization": {
17
+ "stage": 3,
18
+ "overlap_comm": true,
19
+ "contiguous_gradients": true,
20
+ "sub_group_size": 1e9,
21
+ "reduce_bucket_size": 1e6,
22
+ "stage3_prefetch_bucket_size": 1e6,
23
+ "stage3_param_persistence_threshold": 1e4,
24
+ "stage3_max_live_parameters": 1e9,
25
+ "stage3_max_reuse_distance": 1e9,
26
+ "stage3_gather_16bit_weights_on_model_save": true,
27
+ "zero_hpz_partition_size": 8
28
+ }
29
+ }