Upload files with `vila-upload`.
Upload utils.py
Upload auto_processor.py
Upload README.md
Upload .gitattributes
Upload modeling_vila.py
- .gitattributes +0 -1
- README.md +13 -7
- auto_processor.py +25 -22
- modeling_vila.py +5 -5
- utils.py +9 -3
.gitattributes CHANGED

```diff
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
```
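Net effect: `llm/tokenizer.json` is no longer routed through Git LFS, so from this upload onward the tokenizer JSON is presumably stored as a regular text file in the repo.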
README.md CHANGED

````diff
@@ -1,13 +1,18 @@
 ---
-license: cc
+license: cc-by-nc-4.0
 language:
 - en
+tags:
+- vila
+- nvila
+- conversational
+- multimodal
 ---
 
 Dependency setups:
 
 ```bash
-pip install transformers==4.46 accelerate opencv-python torchvision einops
+pip install transformers==4.46 accelerate opencv-python torchvision einops
 pip install git+https://github.com/bfshi/scaling_on_scales.git
 ```
 
@@ -19,7 +24,7 @@ from termcolor import colored
 
 model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
 
-# you can use config
+# you can use config
 config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
 model = AutoModel.from_config(config, trust_remote_code=True)
 # or directly from_pretrained
@@ -44,7 +49,7 @@ print(colored(response, "cyan", attrs=["bold"]))
 
 ## AutoProcessor
 
-we also support `AutoProcessor` class
+we also support `AutoProcessor` class to ease data preparation for training and finetuning.
 
 ```python
 from transformers import AutoProcessor, AutoModel
@@ -69,13 +74,13 @@ output_ids = model.generate(
     },
     media_config={
         "image": {}
-    },
+    },
     generation_config=model.generation_config,
     max_new_tokens=256,
 )
 print(processor.tokenizer.decode(output_ids[0], skip_special_tokens=True))
 
-##### the above code is equivalent to
+##### the above code is equivalent to
 # response = model.generate_content([
 #     PIL.Image.open("demo_images/demo_img_1.png"),
 #     "describe the image?"
@@ -98,7 +103,8 @@ if osp.isdir(output_dir):
     shutil.rmtree(output_dir)
 from llava.remote_code.modeling_vila import VILAForCasualLM
 VILAForCasualLM.convert_vila_dev_ckpt_to_remote(model_path, output_dir, copy=False)
-```
+```
+---
 license: cc-by-nc-4.0
 library_name: transformers
 pipeline_tag: text-generation
````
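Pieced together, the README fragments above describe one inference flow. Below is a minimal end-to-end sketch assembled from them; the `input_ids=` keyword and the shape of what `apply_chat_template` returns are assumptions based on the `__main__` smoke test in auto_processor.py, and `demo_images/demo_img_1.png` is the repo's example asset.

```python
from transformers import AutoModel, AutoProcessor

model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"

# Both the model and the processor ship as remote code in this repo.
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# GPT-style conversation; apply_chat_template converts it to VILA's
# human/gpt format and tokenizes it in one step.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "path": "demo_images/demo_img_1.png"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
inputs = processor.apply_chat_template(conversation)  # assumed: returns a BatchFeature

output_ids = model.generate(
    input_ids=inputs.input_ids,  # assumed keyword; see the __main__ test in auto_processor.py
    media={"image": inputs.image},
    media_config={"image": {}},
    generation_config=model.generation_config,
    max_new_tokens=256,
)
print(processor.tokenizer.decode(output_ids[0], skip_special_tokens=True))
```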
auto_processor.py CHANGED

```diff
@@ -1,8 +1,9 @@
-import os
+import os
+import os.path as osp
 from collections import defaultdict
 from typing import List, Union
 
-from transformers import
+from transformers import AutoConfig, AutoImageProcessor, AutoModel, AutoProcessor, AutoTokenizer
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ImageInput, VideoInput
 from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
@@ -10,9 +11,8 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 from transformers.utils import logging
 
 from .constants import DEFAULT_IMAGE_TOKEN, MEDIA_TOKENS
-from .media import Image, Video
+from .media import Image, Video, extract_media
 from .mm_utils import process_image, process_images
-from .media import extract_media
 from .tokenizer_utils import tokenize_conversation
 
 
@@ -41,7 +41,7 @@ class VILAProcessor(ProcessorMixin):
         self.image_processor = image_processor
         self.tokenizer = tokenizer
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
-
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         if os.path.isdir(pretrained_model_name_or_path):
@@ -49,16 +49,23 @@ class VILAProcessor(ProcessorMixin):
         else:
             print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
             from huggingface_hub import HfApi, snapshot_download
+
             pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
 
-        image_processor = AutoImageProcessor.from_pretrained(
-
+        image_processor = AutoImageProcessor.from_pretrained(
+            osp.join(pretrained_model_name_or_path, "vision_tower"), trust_remote_code=True
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            osp.join(pretrained_model_name_or_path, "llm"), trust_remote_code=True
+        )
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
-
+
         return cls(image_processor=image_processor, tokenizer=tokenizer, config=config)
 
     def __repr__(self):
-        return
+        return (
+            f"VILAProcessor(image_processor={self.image_processor}, tokenizer={self.tokenizer}, config={self.config})"
+        )
 
     def __call__(
         self,
@@ -145,12 +152,9 @@ class VILAProcessor(ProcessorMixin):
     # inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
     def apply_chat_template(self, conversation, add_generation_prompt=True, **kwargs):
         vila_conv = []
-
+
         for chat in conversation:
-            vila_chat = {
-                "from": "",
-                "value": []
-            }
+            vila_chat = {"from": "", "value": []}
             if chat["role"] == "user":
                 # user allows to input image and text
                 vila_chat["from"] = "human"
@@ -167,9 +171,10 @@ class VILAProcessor(ProcessorMixin):
                 assert content["type"] == "text", f"Unsupported content type: {content['type']}"
                 vila_chat["value"].append(content["text"])
             vila_conv.append(vila_chat)
-
+
         return self(vila_conv)
 
+
 if __name__ == "__main__":
     # gpt style: user, assistant
     # vila style: human, gpt
@@ -178,8 +183,8 @@ if __name__ == "__main__":
             "role": "user",
             "content": [
                 {"type": "image", "path": "demo_images/demo_img_1.png"},
-                {"type": "text", "text": "Describe this image."}
-            ]
+                {"type": "text", "text": "Describe this image."},
+            ],
         }
     ]
 
@@ -211,7 +216,7 @@ if __name__ == "__main__":
         tokenizer=model.tokenizer,
     )
 
-    # TODO: add padding, return_tensors,
+    # TODO: add padding, return_tensors,
     inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
     print(inputs.keys(), inputs.input_ids.shape, [_.shape for _ in inputs.image])
     print("vila conv pass")
@@ -225,10 +230,8 @@ if __name__ == "__main__":
         media={
             "image": inputs.image,
         },
-        media_config={
-            "image": {}
-        },
+        media_config={"image": {}},
         generation_config=model.generation_config,
         max_new_tokens=100,
     )
-    print(output_ids)
+    print(output_ids)
```
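At its core, `apply_chat_template` is a format shim: GPT-style roles (`user`/`assistant`) become VILA-style ones (`human`/`gpt`), and each message's structured content list is flattened into a single `value` list before the processor is invoked on the result. A standalone sketch of just that mapping, simplified from the method above; the `assistant` branch is inferred from the "gpt style / vila style" comment, and the real code wraps image entries in the repo's `media.Image` class rather than passing raw paths:

```python
def to_vila_conversation(conversation):
    """Map GPT-style chat messages to VILA's human/gpt format (simplified sketch)."""
    role_map = {"user": "human", "assistant": "gpt"}
    vila_conv = []
    for chat in conversation:
        vila_chat = {"from": role_map[chat["role"]], "value": []}
        for content in chat["content"]:
            if content["type"] == "image":
                # Real code: vila_chat["value"].append(Image(content["path"]))
                vila_chat["value"].append(content["path"])
            elif content["type"] == "text":
                vila_chat["value"].append(content["text"])
            else:
                raise ValueError(f"Unsupported content type: {content['type']}")
        vila_conv.append(vila_chat)
    return vila_conv


conv = [
    {
        "role": "user",
        "content": [
            {"type": "image", "path": "demo_images/demo_img_1.png"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
print(to_vila_conversation(conv))
# [{'from': 'human', 'value': ['demo_images/demo_img_1.png', 'Describe this image.']}]
```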
modeling_vila.py CHANGED

```diff
@@ -38,6 +38,7 @@ from transformers import (
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.modeling_utils import ContextManagers, no_init_weights
 
+from .auto_processor import VILAProcessor
 from .base_projector import MultimodalProjector, MultimodalProjectorConfig
 from .builder import build_llm_and_tokenizer
 from .configuration_vila import VILAConfig
@@ -49,7 +50,7 @@ from .mm_utils import process_image, process_images
 from .siglip_encoder import SiglipVisionTower, SiglipVisionTowerDynamicS2, SiglipVisionTowerS2
 from .tokenizer_utils import tokenize_conversation
 from .utils import get_model_config, load_tokenizer_then_handle_media_tokens_and_chat_template
-
+
 
 # from llava.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, NUM_EXTRA_TOKENS
 # quick hack for remote code
@@ -230,7 +231,7 @@ class VILAPretrainedModel(PreTrainedModel):
             api = HfApi()
             model_path = snapshot_download(model_path, local_dir=output_dir)
             print("downloading HF model to", model_path)
-
+
         if check_dot_in_model_path(model_path) and output_dir is None:
             raise ValueError(
                 f"Model path {model_path} contains a dot, which will affect the remote code loading. Please specify the output directory without dot in the path to fix this issue."
@@ -280,10 +281,10 @@ class VILAPretrainedModel(PreTrainedModel):
             src_fname = os.path.join(current_folder, file_name)
             dst_fname = os.path.join(output_dir, "README.md")
             if os.path.exists(dst_fname):
-                old_reamde = open(dst_fname
+                old_reamde = open(dst_fname).read()
             else:
                 old_reamde = ""
-            with open(src_fname
+            with open(src_fname) as src, open(dst_fname, "w") as dst:
                 dst.write(src.read())
                 dst.write(old_reamde)
                 print("[HF remote code] REAMDE ", src_fname, "to", dst_fname)
@@ -299,7 +300,6 @@ class VILAPretrainedModel(PreTrainedModel):
             os.remove(os.path.join(output_dir, file_name))
             os.symlink(full_file_name, os.path.join(output_dir, file_name))
             print("[HF remote code] linking", full_file_name, "to", output_dir)
-
 
     def save_pretrained(self, output_dir, state_dict=None):
         if state_dict is None:
```
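The README hunk rewrites the model-card merge step: read any pre-existing README, write the remote-code template first, then append the old card after it, so metadata such as license and tags survives conversion. A sketch of that pattern in isolation; the function name and paths are hypothetical, as the real logic is inline in `convert_vila_dev_ckpt_to_remote`:

```python
import os


def prepend_template_to_readme(src_fname: str, dst_fname: str) -> None:
    """Write the template README, then append any pre-existing README after it."""
    # Preserve whatever model card already exists (license, tags, ...).
    old_readme = open(dst_fname).read() if os.path.exists(dst_fname) else ""
    with open(src_fname) as src, open(dst_fname, "w") as dst:
        dst.write(src.read())  # template first
        dst.write(old_readme)  # original card appended after


# Hypothetical usage:
# prepend_template_to_readme("remote_code/README.md", "output_dir/README.md")
```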
utils.py CHANGED

```diff
@@ -19,15 +19,20 @@ import os.path as osp
 
 from huggingface_hub import repo_exists, snapshot_download
 from huggingface_hub.utils import HFValidationError, validate_repo_id
-from transformers import AutoConfig,
+from transformers import AutoConfig, AutoTokenizer, PretrainedConfig
 
 from .configuration_vila import VILAConfig
 from .constants import MEDIA_TOKENS
 from .tokenizer_utils import infer_stop_tokens
 
-def load_tokenizer_then_handle_media_tokens_and_chat_template(model_name_or_path
+
+def load_tokenizer_then_handle_media_tokens_and_chat_template(
+    model_name_or_path, config: VILAConfig, model_max_length=None
+):
     # TODO(ligeng): a lot of copy-paste code, refactor to make a single function
-    tokenizer = AutoTokenizer.from_pretrained(
+    tokenizer = AutoTokenizer.from_pretrained(
+        osp.join(model_name_or_path, "llm"), padding_side="right", use_fast=True, legacy=False
+    )
     if model_max_length is not None:
         tokenizer.model_max_length = model_max_length
 
@@ -54,6 +59,7 @@ def load_tokenizer_then_handle_media_tokens_and_chat_template(model_name_or_path
 
     return tokenizer
 
+
 def get_model_config(config):
     default_keys = ["llm_cfg", "vision_tower_cfg", "mm_projector_cfg"]
 
```
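Both this helper and `VILAProcessor.from_pretrained` now load the tokenizer from the checkpoint's `llm/` subfolder, matching how these VILA checkpoints split the LLM, vision tower, and projector into subdirectories. A minimal sketch of that convention, assuming a locally downloaded checkpoint directory:

```python
import os.path as osp

from transformers import AutoTokenizer


def load_vila_tokenizer(checkpoint_dir, model_max_length=None):
    """Load the tokenizer from the checkpoint's llm/ subfolder, as utils.py now does."""
    tokenizer = AutoTokenizer.from_pretrained(
        osp.join(checkpoint_dir, "llm"), padding_side="right", use_fast=True, legacy=False
    )
    if model_max_length is not None:
        tokenizer.model_max_length = model_max_length
    return tokenizer
```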