diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..9ee5cc00550f628bb4a2779059b594b477d955f1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,53 @@
+# Base image
+FROM public.ecr.aws/docker/library/ubuntu:22.04
+
+# Set ENV
+ENV LANG=C.UTF-8
+ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+ENV PATH=/opt/aws/neuron/bin:$PATH
+
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    git \
+    wget \
+    gnupg2 \
+    python3-pip \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+# Add the AWS Neuron apt repo (jammy = Ubuntu 22.04, matching the base image) and its official signing key
+RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
+RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
+
+RUN apt-get update \
+    && apt-get install -y \
+    aws-neuronx-tools \
+    aws-neuronx-runtime-lib \
+    aws-neuronx-collectives \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+# Set pip
+RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+
+# Set working directory
+WORKDIR /app
+
+# Copy requirements file
+COPY ./app/requirements.txt .
+
+# Install dependencies
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+# Copy app code
+COPY ./app .
+
+# Expose port
+EXPOSE 8000
+
+# Command to run the app
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 7be5fc7f47d5db027d120b8024982df93db95b74..285c03cca9cdf4b8d35f09fd72f8c4a505d9b7ca 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,40 @@ ---- -license: mit ---- +# Mistral on AWS Inf2 with FastAPI +Use FastAPI to quickly host serving of Mistral model on AWS Inferentia2 instance Inf2 🚀 +Support Multimodal input type (input_embeds) 🖼️ + +![image](https://github.com/davidshtian/Mistral-on-AWS-Inf2-with-FastAPI/assets/14228056/94f8aa15-6851-41d5-b89e-2b8699949fef) + + +## Environment Setup +Follow the instructions in Neuron docs [Pytorch Neuron Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-setup.html) for basic environment setup. + +## Install Packages +Go to the virtual env and install the extra packages. 
+```
+cd app
+pip install -r requirements.txt
+```
+
+## Run the App
+```
+uvicorn main:app --host 0.0.0.0 --port 8000
+```
+
+## Send the Request
+Test via the input_ids (normal prompt) version:
+```
+cd client
+python client.py
+```
+
+Test via the input_embeds (common multimodal input, skip embedding layer) version:
+```
+cd client
+python embeds_client.py
+```
+
+## Container
+You could build the container image using the Dockerfile, or use the pre-built image:
+```
+docker run --rm --name mistral -d -p 8000:8000 --device=/dev/neuron0 public.ecr.aws/shtian/fastapi-mistral
+```
diff --git a/app/__pycache__/backend_model.cpython-310.pyc b/app/__pycache__/backend_model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..906b78dc0b63c3ce86f1fc1e2786278e28646e0d
Binary files /dev/null and b/app/__pycache__/backend_model.cpython-310.pyc differ
diff --git a/app/__pycache__/backend_model.cpython-39.pyc b/app/__pycache__/backend_model.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a3771c4d32e13ff30cdcd039e40ff6c2ee420fe
Binary files /dev/null and b/app/__pycache__/backend_model.cpython-39.pyc differ
diff --git a/app/__pycache__/main.cpython-310.pyc b/app/__pycache__/main.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..14f0eb5873d00b36bd9ed3f0b87af6718e6d5e05
Binary files /dev/null and b/app/__pycache__/main.cpython-310.pyc differ
diff --git a/app/__pycache__/main.cpython-39.pyc b/app/__pycache__/main.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c2852fcd713921ddddfde8b618ce7849e8cb462
Binary files /dev/null and b/app/__pycache__/main.cpython-39.pyc differ
diff --git a/app/backend_model.py b/app/backend_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8d134949dbcd7a3916faa16c711c2b87311296f
--- /dev/null
+++ b/app/backend_model.py
@@ -0,0 +1,185 @@
+import logging
+from typing import Union, List, 
Optional, Dict, Any, Literal +import torch +import torch.nn.functional as F +from transformers import AutoTokenizer +import transformers +from transformers_neuronx import MistralForSampling, GQA, NeuronConfig, QuantizationConfig +import time +import math +import concurrent.futures + + +def padding_ceiling(n): + if n <= 0: + return 1 + elif n & (n - 1) == 0: # Check if n is already a power of 2 + return n + else: + return 2 ** math.ceil(math.log2(n)) + + +class MyStreamer(transformers.generation.streamers.BaseStreamer): + def __init__(self) -> None: + self.reset() + + def reset(self): + self.token_latencies = [] + self.iter = 0 + self.now = time.time() + + def put(self, tokens): + now = time.time() + token_latency = now - self.now + self.now = now + self.iter += 1 + self.token_latencies.append(token_latency) + + def end(self): + print("\n\n") + print("First 5 token latencies:", self.token_latencies[:5]) + print("All token latencies:", sum(self.token_latencies[:])) + + +class MistralModel: + """ + A class for generating text using the Mistral language model. + """ + + def __init__(self, model_name): + self.neuron_config = NeuronConfig(group_query_attention=GQA.SHARD_OVER_HEADS, + quant=QuantizationConfig(quant_dtype='s8', dequant_dtype='bf16')) + # self.model_name = 'mistralai/Mistral-7B-Instruct-v0.2' + self.model_name = model_name + self.amp: Literal['bf16', 'fp32'] = 'bf16' + self.batch_size = 1 + self.tp_degree = 2 + self.n_positions = 4096 + self.context_length_estimate = [2289, 4096] + # self.context_length_estimate = 2289 + + self.model = self._load_model() + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.prompt_template = "[INST] {prompt} [/INST]" + + def _load_model(self) -> MistralForSampling: + """ + Load and initialize the Mistral model. + + Returns: + MistralForSampling: The initialized Mistral model. 
+ """ + model = MistralForSampling.from_pretrained( + self.model_name, + amp=self.amp, + batch_size=self.batch_size, + tp_degree=self.tp_degree, + n_positions=self.n_positions, + neuron_config=self.neuron_config, + context_length_estimate=self.context_length_estimate, + # compiler_args=["--model-type=transformer", "--target=inf2", "--auto-cast=all", "--auto-cast-type=fp8_e4m3", "--optlevel=3", "--enable-saturate-infinity"] + ) + model.to_neuron() + return model + + def generate(self, inputs: Union[str, List[int]], parameters: Optional[Dict[str, Any]] = None) -> str: + """ + Generate text using the Mistral model. + + Args: + inputs (Union[str, List[int]]): The input prompt or a list of input embeddings. + parameters (Optional[Dict[str, Any]]): Optional parameters for text generation. + + Returns: + str: The generated text. + + Raises: + ValueError: If the input type is invalid. + """ + try: + max_new_tokens = parameters.get("max_new_tokens", 256) + top_k = parameters.get("top_k", 100) + top_p = parameters.get("top_p", 0.1) + temperature = parameters.get("temperature", 0.1) + no_repeat_ngram_size = parameters.get("no_repeat_ngram_size", 3) + print( + f"parameters max_new_tokens: {max_new_tokens}, top_k: {top_k}, top_p: {top_p}, temperature: {temperature}, no_repeat_ngram_size: {no_repeat_ngram_size}") + + if isinstance(inputs, str): + generated_text = self._generate_from_prompt(inputs, max_new_tokens, top_k, top_p, temperature, + no_repeat_ngram_size) + elif isinstance(inputs, list): + generated_text = self._generate_from_embeddings(inputs, max_new_tokens, top_k, top_p, temperature, + no_repeat_ngram_size) + else: + raise ValueError("Invalid input type. 
Must be str or List[int]") + + return generated_text + except Exception as e: + logging.error(f"Error generating text: {e}") + raise + + def _generate_from_prompt(self, prompt: str, max_new_tokens: int, top_k: float, top_p: float, temperature: float, + no_repeat_ngram_size: int) -> str: + """ + Generate text from a given prompt using the Mistral model. + + Args: + prompt (str): The input prompt. + max_new_tokens (int): The maximum number of new tokens to generate. + + Returns: + str: The generated text. + """ + input_prompt = self.prompt_template.format(prompt=prompt) + encoded_input = self.tokenizer(input_prompt, return_tensors='pt') + input_ids = encoded_input.input_ids + + with torch.inference_mode(): + generated_sequence = self.model.sample(input_ids, sequence_length=min(self.n_positions, + input_ids.shape[1] + max_new_tokens), + start_ids=None, top_k=top_k, top_p=top_p, temperature=temperature, + no_repeat_ngram_size=no_repeat_ngram_size) + decoded_output = [self.tokenizer.decode(tok) for tok in generated_sequence] + + generated_text = decoded_output[0].split('[/INST]')[1].strip("").strip() + return generated_text + + def _generate_from_embeddings(self, input_embeddings: List[int], max_new_tokens: int, top_k: float, top_p: float, + temperature: float, no_repeat_ngram_size: int) -> str: + """ + Generate text from a given list of input embeddings using the Mistral model. + + Args: + input_embeddings (List[int]): A list of input embeddings. + max_new_tokens (int): The maximum number of new tokens to generate. + + Returns: + str: The generated text. 
+ """ + s1 = time.time() + input_embeds_tensor = torch.tensor(input_embeddings) + input_embeds_length = input_embeds_tensor.shape[1] + padding_size = padding_ceiling(input_embeds_length) + if padding_size >= self.n_positions: + padding_size = input_embeds_length + padded_input_embeds = input_embeds_tensor + else: + padding_gap = padding_size - input_embeds_length + padded_input_embeds = F.pad(input_embeds_tensor, (0, 0, padding_gap, 0), value=self.tokenizer.pad_token_id) + print("ms1 - input_embeds time: ", time.time() - s1) + + s2 = time.time() + with torch.inference_mode(): + generated_sequence = self.model.sample(padded_input_embeds, + sequence_length=min(self.n_positions, padding_size + max_new_tokens), + start_ids=None, top_k=top_k, top_p=top_p, temperature=temperature, + no_repeat_ngram_size=no_repeat_ngram_size, streamer=MyStreamer()) + with concurrent.futures.ThreadPoolExecutor() as executor: + decoded_output = list(executor.map(self.tokenizer.decode, generated_sequence)) + # decoded_output = [self.tokenizer.decode(tok) for tok in generated_sequence] + print("ms2 - decoded_output time: ", time.time() - s2) + + generated_text = decoded_output[0].strip("").strip() + return generated_text + diff --git a/app/llava/__init__.py b/app/llava/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8ffd1c0637a9202a555da8abe7f96785ed3b483e --- /dev/null +++ b/app/llava/__init__.py @@ -0,0 +1,9 @@ +from .model import LlavaLlamaForCausalLM, LlavaMistralForCausalLM +try: + from .model import LlavaGemmaForCausalLM +except: + pass +try: + from .model import LlavaThothForCausalLM +except: + pass \ No newline at end of file diff --git a/app/llava/__pycache__/__init__.cpython-310.pyc b/app/llava/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffd594903ffb27149d748f9ac90d04eb2166fe20 Binary files /dev/null and b/app/llava/__pycache__/__init__.cpython-310.pyc differ diff --git 
a/app/llava/__pycache__/__init__.cpython-39.pyc b/app/llava/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f85fc0f7c0389e8c6ad59aee4fe426f3e46535 Binary files /dev/null and b/app/llava/__pycache__/__init__.cpython-39.pyc differ diff --git a/app/llava/__pycache__/constants.cpython-310.pyc b/app/llava/__pycache__/constants.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0a4c9e81bb4c715ea9abe297269602fbe545903 Binary files /dev/null and b/app/llava/__pycache__/constants.cpython-310.pyc differ diff --git a/app/llava/__pycache__/constants.cpython-39.pyc b/app/llava/__pycache__/constants.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d08ebd6b1e81471c70a2c0772fef4a14859ad90 Binary files /dev/null and b/app/llava/__pycache__/constants.cpython-39.pyc differ diff --git a/app/llava/__pycache__/conversation.cpython-310.pyc b/app/llava/__pycache__/conversation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3b8c337d277a2ffcfcdb699af649bf30cf65a64 Binary files /dev/null and b/app/llava/__pycache__/conversation.cpython-310.pyc differ diff --git a/app/llava/__pycache__/conversation.cpython-39.pyc b/app/llava/__pycache__/conversation.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75e5aba9b018ce7d8d393c483cb908fe726f40fc Binary files /dev/null and b/app/llava/__pycache__/conversation.cpython-39.pyc differ diff --git a/app/llava/__pycache__/mm_utils.cpython-310.pyc b/app/llava/__pycache__/mm_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1efd092f275573da027a83c8502422b57d4a1035 Binary files /dev/null and b/app/llava/__pycache__/mm_utils.cpython-310.pyc differ diff --git a/app/llava/__pycache__/mm_utils.cpython-39.pyc b/app/llava/__pycache__/mm_utils.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..fd1a513095eca2e34403ebb53f9b4ff67a86ca2d Binary files /dev/null and b/app/llava/__pycache__/mm_utils.cpython-39.pyc differ diff --git a/app/llava/__pycache__/utils.cpython-310.pyc b/app/llava/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f609d37c6bc2d4c3e58d7e30c8e7e81d4836e5b Binary files /dev/null and b/app/llava/__pycache__/utils.cpython-310.pyc differ diff --git a/app/llava/__pycache__/utils.cpython-39.pyc b/app/llava/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5335afd9b4f37419cf15f884bc0a3e091f0f832 Binary files /dev/null and b/app/llava/__pycache__/utils.cpython-39.pyc differ diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9aceeb56940af1eb314b790ffead52a86f26b58c --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of file diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid_act.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid_act.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35736ba52bfd479f9a317a1140dad2119ea40fcb --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid_act.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m', 'activitynet'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of 
file diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid_hdvila.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid_hdvila.yaml new file mode 100644 index 0000000000000000000000000000000000000000..086ee4da3b3f2163edee749071a71c1c670d9654 --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid_hdvila.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m', 'hdvila'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of file diff --git a/app/llava/configs/action_dataset_ablation/finetune_webvid_vidal.yaml b/app/llava/configs/action_dataset_ablation/finetune_webvid_vidal.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9346411233600d7c261303b927bed5d8194573bd --- /dev/null +++ b/app/llava/configs/action_dataset_ablation/finetune_webvid_vidal.yaml @@ -0,0 +1,11 @@ +datasets: + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + select_datasets: ['webvid10m', 'webvid2m', 'vidal'] + # select_datasets: ['webvid10m', 'webvid2m', 'activitynet', 'vidal', 'hdvila'] \ No newline at end of file diff --git a/app/llava/configs/adso_increasing_ablation/finetune_data_pure_gpt4v.yaml b/app/llava/configs/adso_increasing_ablation/finetune_data_pure_gpt4v.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16d33c1ae2660703ec9a24083d42e6e192be2c5d --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_data_pure_gpt4v.yaml @@ -0,0 +1,55 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# +# tt_vqa: +# data_type: frames +# sample_ratio: 1 + + 
ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + + diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso135k.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso135k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb5524c72f6f87fd56b0577f5b4153861c8d4817 --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso135k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 2 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240208_meta_data_single_135k_caption_160k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git 
a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bec3072faffdaa66474661125c2fdfaad3ee1a7c --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240220_meta_data_single_190k_caption_160k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_baseline.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_baseline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c95969abeb1862fd40eed4b8218fc152d8ac388e --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_baseline.yaml @@ -0,0 +1,55 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - 
science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240220_meta_data_single_190k_caption_160k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] + diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_no_qa.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_no_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5bf237cd3165d506ed567df46c69dd2c0218981 --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso185k_no_qa.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20240220_meta_data_single_190k_caption_no_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 
+ sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso65k.yaml b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso65k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72a106082df9329e7622afd01da7440724e2ebda --- /dev/null +++ b/app/llava/configs/adso_increasing_ablation/finetune_gpt4v_adso65k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 2 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + sample_method: sequential + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/finetune_debug.yaml b/app/llava/configs/finetune_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f56ae71ac216ccd4e270140eaf6d2a0d64dce507 --- /dev/null +++ b/app/llava/configs/finetune_debug.yaml @@ -0,0 +1,8 @@ +datasets: + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + train_data_path: 
/mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json diff --git a/app/llava/configs/finetune_gpt4v_adso65k.yaml b/app/llava/configs/finetune_gpt4v_adso65k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c4e875c9421f4ec2a93337b7c1029ef069d6135 --- /dev/null +++ b/app/llava/configs/finetune_gpt4v_adso65k.yaml @@ -0,0 +1,56 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 2 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 6 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_130k.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d48e01af16249c02535329a376a957f2de9864e6 --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# 
- st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 10 + conv_type: single + task_types: ['summary', 'detail'] + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k_no_summary.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k_no_summary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa0acd8fed6d295e15f579d92c3963fa911e588f --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public500k_no_summary.yaml @@ -0,0 +1,57 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + + gpt4v_public: + data_type: frames + fps: 1.0 + sample_ratio: 4 + conv_type: single + task_types: 
['detail'] + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public800k.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public800k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7f240c56ee73fccbde5d10d0153eeacecdc46f2 --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_gpt4v_public800k.yaml @@ -0,0 +1,62 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption', 'qas'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['summary', 'detail', 'qa_pairs'] diff --git a/app/llava/configs/gpt4v_increasing_ablation/finetune_videollava.yaml b/app/llava/configs/gpt4v_increasing_ablation/finetune_videollava.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..c009b246e6b05873f443338ebf6273fbd3e4406f --- /dev/null +++ b/app/llava/configs/gpt4v_increasing_ablation/finetune_videollava.yaml @@ -0,0 +1,20 @@ +datasets: + + # llava_pretrain: + # data_type: image + # sample_ratio: 1 + + # gpt4v_public: + # data_type: frames + # sample_ratio: 2 + # task_types: ['summary'] + # fps: 1.0 + # conv_type: single + + lk_image: + data_type: image + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 \ No newline at end of file diff --git a/app/llava/configs/pretrain_data.yaml b/app/llava/configs/pretrain_data.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94b4720e54826cc0e3bd41563f82d1131207cde0 --- /dev/null +++ b/app/llava/configs/pretrain_data.yaml @@ -0,0 +1,17 @@ +datasets: + + llava_pretrain: + data_type: image + sample_ratio: 1 + + # internvid: + # data_type: frames + # sample_ratio: 10 + + gpt4v_public: + data_type: frames + sample_ratio: 1 + task_types: ['summary'] + fps: 1.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json diff --git a/app/llava/configs/pretrain_data_large.yaml b/app/llava/configs/pretrain_data_large.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91dbd632e5d6e188e9735fdb4544e3ffde728293 --- /dev/null +++ b/app/llava/configs/pretrain_data_large.yaml @@ -0,0 +1,17 @@ +datasets: + + llava_pretrain: + data_type: image + sample_ratio: 1 + + internvid: + data_type: frames + sample_ratio: 10 + + gpt4v_public: + data_type: frames + sample_ratio: 1 + task_types: ['summary'] + fps: 1.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json diff --git a/app/llava/configs/pretrain_debug.yaml b/app/llava/configs/pretrain_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aba6e3147e88180cbfac45cf2a762eddeb7fad74 --- /dev/null +++ 
b/app/llava/configs/pretrain_debug.yaml @@ -0,0 +1,27 @@ +datasets: + + llava_pretrain: + data_type: image + sample_ratio: 1 + + # gpt4v_public: + # data_type: frames + # sample_ratio: 2 + # task_types: ['summary'] + # fps: 1.0 + # conv_type: single + + # lk_image: + # data_type: image + + # lk_video: + # data_type: frames + # conv_type: multi + # fps: 1.0 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: multi + task_types: ['qa_pairs'] diff --git a/app/llava/configs/promptv1_2_increasing_ablation/finetune_gpt4_prompt_140k.yaml b/app/llava/configs/promptv1_2_increasing_ablation/finetune_gpt4_prompt_140k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6be76c78831f080266a9ac7999a56393709ee6e --- /dev/null +++ b/app/llava/configs/promptv1_2_increasing_ablation/finetune_gpt4_prompt_140k.yaml @@ -0,0 +1,35 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # gpt4v_internal: + # data_type: frames + # fps: 2.0 + # sample_ratio: 1 + # conv_type: single + # task_types: ['summary', 'detail', 'qa_pairs'] + + promptv1_2_internal: + data_type: frames + sample_ratio: 1 + train_data_path: /mnt/bn/algo-masp-nas-2/kaili.zhao/data/masp_data/train/gpt4v_annotation/202400401week_gpt4v_all_videos_unique_ids.json + task_types: ['refine_caption'] \ No newline at end of file diff --git a/app/llava/configs/release_version/finetune_250k_no_public.yaml b/app/llava/configs/release_version/finetune_250k_no_public.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57d99cdd4bcd1bd2fca229ade38785358a5fd9ad --- /dev/null +++ b/app/llava/configs/release_version/finetune_250k_no_public.yaml @@ -0,0 +1,50 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - 
coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20231201_20240322_caption_250k.json + + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] diff --git a/app/llava/configs/release_version/finetune_all_data.yaml b/app/llava/configs/release_version/finetune_all_data.yaml new file mode 100644 index 0000000000000000000000000000000000000000..134f1a3744075449a11c61d48feae5bbb88f6ddb --- /dev/null +++ b/app/llava/configs/release_version/finetune_all_data.yaml @@ -0,0 +1,63 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + tt_vqa: + data_type: frames + sample_ratio: 3 + fps: 2.0 + conv_type: single + train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/dataset/masp/20231201_20240322_caption_250k.json + + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 
+ conv_type: single + task_types: ['detail'] diff --git a/app/llava/configs/release_version/finetune_gpt4v_caption.yaml b/app/llava/configs/release_version/finetune_gpt4v_caption.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e96e51f30d5d47fee4a575fbdf750b1d8bfa2a7a --- /dev/null +++ b/app/llava/configs/release_version/finetune_gpt4v_caption.yaml @@ -0,0 +1,62 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# - mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] diff --git a/app/llava/configs/release_version/finetune_gpt4v_caption_ocr.yaml b/app/llava/configs/release_version/finetune_gpt4v_caption_ocr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33c38f2a5105f0e28f52cf85dcb9101babf6349c --- /dev/null +++ b/app/llava/configs/release_version/finetune_gpt4v_caption_ocr.yaml @@ -0,0 +1,67 @@ +datasets: + +# m3it: +# data_type: images +# sample_ratio: 4 +# tasks: +# - coco +# - coco-goi +# - coco-text +# - imagenet +# - coco-itm +# - iqa +# 
- mocheg +# - vsr +# - refcoco +# - science-qa +# - vqa-v2 +# - gqa +# - st-vqa +# - text-vqa +# - okvqa +# - a-okvqa +# + # tt_vqa: + # data_type: frames + # sample_ratio: 2 + # fps: 2.0 + # conv_type: single + # train_data_path: /mnt/bn/algo-masp-nas-2/baiyi.by/data/ADSO_Anno_Data/batch_20231128/meta_data_single_60k_caption_170k_QA.json + + ShareGPT4V: + data_type: images + sample_ratio: 1 + + + gpt4v_tt_vqa: + data_type: frames + fps: 0.5 + sample_ratio: 6 + conv_type: single + task_types: ['caption'] + + # gpt4v_public: + # data_type: frames + # fps: 1.0 + # sample_ratio: 10 + # conv_type: single + # task_types: ['summary', 'detail'] + # train_data_path: /mnt/bn/algo-masp-nas-2/xiangchen/data/shared_gpt4v_data/data_500k_filtered.json + + lk_video: + data_type: frames + conv_type: multi + fps: 1.0 + sample_ratio: 6 + + gpt4v_internal: + data_type: frames + fps: 2.0 + sample_ratio: 1 + conv_type: single + task_types: ['detail'] + + synthetic_ocr: + data_type: video + sample_ratio: 1 + fps: 0.5 \ No newline at end of file diff --git a/app/llava/constants.py b/app/llava/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..6049c4b6396020201c16233afa2717aa2a41f9ac --- /dev/null +++ b/app/llava/constants.py @@ -0,0 +1,17 @@ +CONTROLLER_HEART_BEAT_EXPIRATION = 30 +WORKER_HEART_BEAT_INTERVAL = 15 + +LOGDIR = "." + +# Model Constants +IGNORE_INDEX = -100 +MM_TOKEN_INDEX = -200 +DEFAULT_IMAGE_TOKEN = "<image>" +DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" +DEFAULT_IM_START_TOKEN = "<im_start>" +DEFAULT_IM_END_TOKEN = "<im_end>" +IMAGE_PLACEHOLDER = "<image-placeholder>" +DEFAULT_VIDEO_TOKEN = "