Spaces:
Runtime error
Runtime error
| # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright: | |
| # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: | |
| # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: | |
| # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import re | |
| import os | |
| import copy | |
| import json | |
| import random | |
| import pathlib | |
| import traceback | |
| from dataclasses import dataclass, field | |
| from typing import Dict, Optional, Sequence, List | |
| # torch-related packages | |
| # NOTE: torch must be imported before transformers. Otherwise, `Segmentation fault (core dumped)` will occur. | |
| import torch | |
| from torch.utils.data import Dataset | |
| import transformers | |
| from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock | |
| import sys | |
| sys.path.append('./') | |
| from videollama2.model import * | |
| from videollama2.constants import NUM_FRAMES, IGNORE_INDEX, MODAL_INDEX_MAP | |
| from videollama2.mm_utils import tokenizer_multimodal_token, process_video, process_image, process_audio_file | |
| from videollama2.videollama2_trainer import (VideoLLaMA2Trainer, | |
| get_peft_state_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3, | |
| find_all_linear_names, safe_save_model_for_hf_trainer | |
| ) | |
# NOTE: fast tokenizer warning issue: https://github.com/huggingface/transformers/issues/5486
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Rank of the current process; assigned in train() so rank0_print() can gate
# its output to the main process only.
local_rank = None
def rank0_print(*args):
    """Print *args only on the rank-0 process, silencing duplicate logs."""
    if local_rank != 0:
        return
    print(*args)
def set_seed(seed=42):
    """
    Seed every torch RNG (CPU and all CUDA devices) for reproducible runs.

    :param seed: An integer value to be used as the random seed.
    """
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    # Trade cuDNN autotuning speed for bit-exact reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
@dataclass
class ModelArguments:
    """Model-side CLI arguments: LLM backbone, modal connectors, and encoder towers.

    NOTE: the ``@dataclass`` decorator is required — without it the ``field(...)``
    sentinels are ordinary class attributes and ``transformers.HfArgumentParser``
    cannot parse this class (runtime error in ``train()``).
    """
    # LLM Arguments
    model_type: Optional[str] = field(default="videollama2", metadata={"help": "Model type selected in the list: " + ", ".join(VLLMs.keys())})
    model_path: Optional[str] = field(default="lmsys/vicuna-7b-v1.5")
    version: Optional[str] = field(default="v1", metadata={"help": "Version of the conversation template."})
    freeze_backbone: bool = field(default=False, metadata={"help": "Whether to freeze the LLM backbone."})
    tune_adapter_llm: bool = field(default=False)
    # Connector Arguments
    mm_projector_type: Optional[str] = field(default='linear')
    mm_projector_a_type: Optional[str] = field(default='linear')
    tune_mm_mlp_adapter: bool = field(default=False)
    tune_mm_mlp_adapter_a: bool = field(default=False)
    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
    pretrain_mm_mlp_adapter_a: Optional[str] = field(default=None)
    # Vision tower Arguments
    vision_tower: Optional[str] = field(default=None)
    mm_vision_select_layer: Optional[int] = field(default=-1)
    mm_vision_select_feature: Optional[str] = field(default="patch")
    # Audio tower Arguments
    audio_tower: Optional[str] = field(default=None)
    tune_audio_tower: bool = field(default=False)
@dataclass
class DataArguments:
    """Data-side CLI arguments: dataset paths, loading mode, and preprocessing.

    NOTE: the ``@dataclass`` decorator is required for ``transformers.HfArgumentParser``
    to parse this class; it was missing.
    """
    # Path Arguments
    data_path: str = field(default=None, metadata={"help": "Path to the training data."})
    data_path_a: Optional[str] = field(default=None, metadata={"help": "Path to the audio data."})
    # Root folder that image/video entries are resolved against.
    data_folder: Optional[str] = field(default=None)
    # Loading Arguments
    is_multimodal: bool = False
    va: bool = field(default=False)
    lazy_preprocess: bool = False
    num_frames: Optional[int] = field(default=None)
    # Preprocess Arguments
    image_aspect_ratio: str = 'square'
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """Training CLI arguments extending HF ``TrainingArguments`` with
    projector LR, quantization, and LoRA knobs.

    NOTE: the ``@dataclass`` decorator is required — HF's ``TrainingArguments``
    is itself a dataclass and ``HfArgumentParser`` needs the subclass to be
    one too for the extra fields to be parsed.
    """
    optim: str = field(default="adamw_torch")
    mm_projector_lr: Optional[float] = None
    freeze_mm_mlp_adapter: bool = field(default=False)
    remove_unused_columns: bool = field(default=False)
    cache_dir: Optional[str] = field(default=None)
    # Training Data Arguments
    group_by_modality_length: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # Lora or Quant Arguments
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
def preprocess_plain(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    modal_token: str = None,
) -> Dict:
    """Tokenize single-turn (pretraining) conversations.

    Each source must be exactly one (human, gpt) pair whose human turn
    contains ``modal_token``.  The user content is reduced to the bare modal
    token; the instruction prefix of each target is masked with IGNORE_INDEX
    so loss is computed only on the assistant reply.

    Args:
        sources: list of 2-turn conversations (dicts with 'from'/'value').
        tokenizer: tokenizer providing ``apply_chat_template``.
        modal_token: modal placeholder, e.g. ``<audio>``.

    Returns:
        dict with parallel lists ``input_ids`` and ``labels`` (1-D tensors).
    """
    input_ids = []
    targets = []
    for source in sources:
        # 1. apply chat template for input conversation
        assert len(source) == 2
        assert modal_token in source[0]['value']
        message = [
            {'role': 'user', 'content': modal_token},
            {'role': 'assistant', 'content': source[1]['value']}
        ]
        # e.g. "<s> [INST] <audio> [/INST] Someone is speaking.</s>"
        conversation = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
        # 2. tokenize conversations
        input_ids.append(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))
        # 3. make targets: copy the ids, then mask the instruction prefix
        targets.append(copy.deepcopy(input_ids[-1]))
        instruction = tokenizer.apply_chat_template(message[:1], tokenize=False, add_generation_prompt=True)
        instruction_len = len(tokenizer_multimodal_token(instruction, tokenizer, modal_token, return_tensors='pt'))
        targets[-1][:instruction_len] = IGNORE_INDEX
    return dict(input_ids=input_ids, labels=targets)
def preprocess(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    modal_token: str = None,
) -> Dict:
    """Tokenize multi-turn conversations for supervised fine-tuning.

    For every (user, assistant) pair the chat template is re-applied on the
    growing prefix so the token span of each user instruction can be masked
    with IGNORE_INDEX; loss is only computed on assistant replies.

    Args:
        sources: list of conversations; each turn is a dict with
            'from' in {'human', 'gpt'} and 'value' text.
        tokenizer: tokenizer providing ``apply_chat_template``.
        modal_token: modal placeholder (``<image>``/``<video>``/``<audio>``) or None.

    Returns:
        dict with parallel lists ``input_ids`` and ``labels`` (1-D tensors).
    """
    roles = {"human": "user", "gpt": "assistant"}
    input_ids = []
    targets = []
    for source in sources:
        if roles[source[0]["from"]] != "user":
            # Skip the first one if it is not from human
            source = source[1:]
        message = [{'role': roles[sentence['from']], 'content': sentence['value']} for sentence in source]
        conversation = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
        input_ids.append(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))
        targets.append(copy.deepcopy(input_ids[-1]))

        assert len(source) % 2 == 0, f"Invalid conversation length {len(source)}."

        # Walk the turns pairwise, masking each instruction span [cur, instruction_len).
        cur = 0
        message = []
        for idx, sentence in enumerate(source):
            if idx % 2 == 1:
                tmp_message = [
                    {'role': roles[source[idx-1]['from']], 'content': source[idx-1]['value']},
                    {'role': roles[sentence['from']], 'content': sentence['value']}
                ]
                # Length up to and including the current user turn (+ generation prompt)...
                instruction = tokenizer.apply_chat_template(message + tmp_message[:1], tokenize=False, add_generation_prompt=True)
                # ...and length including the assistant reply.
                conversation = tokenizer.apply_chat_template(message + tmp_message, tokenize=False, add_generation_prompt=False)
                instruction_len = len(tokenizer_multimodal_token(instruction, tokenizer, modal_token, return_tensors='pt'))
                conversation_len = len(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))
                targets[-1][cur:instruction_len] = IGNORE_INDEX
                cur = conversation_len
                message += tmp_message
    return dict(input_ids=input_ids, labels=targets)
def preprocess_multimodal(
    sources: Sequence[str],
    data_args: DataArguments,
    modal_token: str = None,
) -> Dict:
    """Normalize modal-token placement: move ``modal_token`` to the head of
    each sentence that contains it, on its own line.

    Mutates ``sources`` in place and returns it.  When the dataset is not
    multimodal, returns ``sources`` unchanged.
    """
    if not data_args.is_multimodal:
        return sources

    assert modal_token in MODAL_INDEX_MAP, f"Unsupported modal token {modal_token}."

    for source in sources:
        for sentence in source:
            if modal_token in sentence['value']:
                # Strip the token wherever it appears, then re-prefix it.
                sentence['value'] = sentence['value'].replace(modal_token, '').strip()
                sentence['value'] = modal_token + '\n' + sentence['value']
                sentence['value'] = sentence['value'].strip()
            # NOTE: a no-op ``replace(modal_token, modal_token)`` was removed here;
            # it was a TODO placeholder for per-modality replacement tokens.
    return sources
class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Samples are loaded lazily: media files (image/video/audio) are only read
    and preprocessed when an item is requested.  Three loading modes:

    * a single JSON file via ``data_path``;
    * a comma-separated list of stage2/stage3 JSON files, shuffled per source
      and concatenated (``mix_sampler_tag`` mode);
    * an audio-only JSON file via ``data_path_a`` (overrides the above when set).
    """

    def __init__(self, data_path: str, data_path_a: str,
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args: DataArguments):
        super(LazySupervisedDataset, self).__init__()
        self.mix_sampler_tag = False
        # Per-source sample lists; only filled in multi-source (mix) mode.
        # Initialized here so the concatenation below cannot raise
        # AttributeError when a source category is absent from data_path.
        self.av_data, self.a_data, self.v_data = [], [], []
        list_data_dict = []
        if data_path is not None and len(data_path.split(",")) == 1:
            data_path = data_path.split(",")[0]
            list_data_dict = json.load(open(data_path, "r"))
        elif data_path is not None and len(data_path.split(",")) > 1:
            self.mix_sampler_tag = True
            data_path = data_path.split(",")
            for path in data_path:
                if "stage3" in path:
                    self.av_data = json.load(open(path, "r"))
                    random.shuffle(self.av_data)
                elif "stage2" in path and "audio" in path:
                    self.a_data = json.load(open(path, "r"))
                    random.shuffle(self.a_data)
                elif "stage2" in path and "video" in path:
                    self.v_data = json.load(open(path, "r"))
                    random.shuffle(self.v_data)
                else:
                    raise NotImplementedError
            # Audio-visual samples come first so __getitem__ can identify them
            # by index (i < len(self.av_data)).
            list_data_dict = self.av_data + self.a_data + self.v_data
        if data_path_a is not None:
            # Audio-only data overrides whatever was loaded above.
            list_data_dict = json.load(open(data_path_a, "r"))

        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.list_data_dict = list_data_dict
        self.data_args = data_args

    def __len__(self):
        return len(self.list_data_dict)

    @property
    def lengths(self):
        # Rough token-length estimate per sample (whitespace word count,
        # plus a fixed 576-token budget for an image if present).
        # NOTE(review): original had no @property; kept as a plain method
        # would change nothing for callers shown here — left as method.
        raise NotImplementedError

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        sources = self.list_data_dict[i]
        if isinstance(i, int):
            sources = [sources]
        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME

        if self.data_args.data_path is not None:
            image_processor = self.data_args.image_processor
            video_processor = self.data_args.video_processor
            num_frames = NUM_FRAMES if self.data_args.num_frames is None else self.data_args.num_frames

        if 'image' in sources[0]:
            image_file = self.list_data_dict[i]['image']
            image_folder = self.data_args.data_folder
            image_file = os.path.join(image_folder, image_file)
            try:
                image = process_image(image_file, image_processor, aspect_ratio=self.data_args.image_aspect_ratio)
            except Exception:
                # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
                # still propagate.  On failure fall back to a random sample.
                traceback.print_exc()
                backup_idx = random.randint(0, len(self.list_data_dict) - 1)
                print(f"Encounted error when reading image {image_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)
            # place <image> tag to question head.
            modal_token = "<image>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
        elif 'video' in sources[0]:
            video_file = self.list_data_dict[i]['video']
            video_folder = self.data_args.data_folder
            if video_folder:
                video_file = os.path.join(video_folder, video_file)
            try:
                # In mix mode only the leading av_data block is loaded with audio.
                video = process_video(video_file, video_processor, aspect_ratio=self.data_args.image_aspect_ratio, num_frames=num_frames, va=self.data_args.va if not self.mix_sampler_tag else (i < len(self.av_data)))
            except Exception:
                traceback.print_exc()
                backup_idx = random.randint(0, len(self.list_data_dict) - 1)
                print(f"Encounted error when reading video {video_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)
            # place <video> tag to question head.
            modal_token = "<video>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
        elif 'audio' in sources[0]:
            audio_file = self.list_data_dict[i]['audio']
            try:
                audio = process_audio_file(audio_file)
            except Exception as e:
                print(e)
                backup_idx = random.randint(0, len(self.list_data_dict) - 1)
                print(f"Encounted error when reading audio {audio_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)
            modal_token = "<audio>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
        else:
            modal_token = None
            sources = copy.deepcopy([e["conversations"] for e in sources])

        if self.data_args.is_pretraining:
            data_dict = preprocess_plain(sources, self.tokenizer, modal_token=modal_token)
        else:
            data_dict = preprocess(sources, self.tokenizer, modal_token=modal_token)

        if isinstance(i, int):
            data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])

        # Attach the raw modal tensor (if any) to the sample.
        if 'image' in self.list_data_dict[i]:
            data_dict['image'] = image
        elif 'video' in self.list_data_dict[i]:
            data_dict['video'] = video
        elif 'audio' in self.list_data_dict[i]:
            data_dict['audio'] = audio
        elif self.data_args.data_path_a:
            # No audio in the sample but the model expects one: zero mel features.
            # NOTE(review): (1, 2998, 128) presumably matches the audio tower's
            # expected fbank shape — confirm against the audio processor.
            data_dict['audio'] = torch.zeros(1, 2998, 128)
        elif self.data_args.is_multimodal:
            # image does not exist in the data, but the model is multimodal
            data_dict['image'] = torch.zeros(3, self.data_args.image_size, self.data_args.image_size)
        return data_dict

    def lengths(self):
        """Rough per-sample token-length estimate (word count + image budget)."""
        length_list = []
        for sample in self.list_data_dict:
            img_tokens = 576 if 'image' in sample else 0
            length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
        return length_list

    def modality_lengths(self):
        """Signed length estimate: positive for image samples, negative otherwise."""
        length_list = []
        for sample in self.list_data_dict:
            cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
            cur_len = cur_len if 'image' in sample else -cur_len
            length_list.append(cur_len)
        return length_list
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Pads ``input_ids``/``labels`` to the batch maximum (truncated to the
    tokenizer's ``model_max_length``) and gathers each sample's modal tensor
    into ``batch['images']`` as ``(tensor, modal_name)`` pairs.

    NOTE: the ``@dataclass`` decorator is required — without it the
    ``tokenizer`` annotation creates no ``__init__`` parameter and the
    ``DataCollatorForSupervisedDataset(tokenizer=...)`` call in
    ``make_supervised_data_module`` raises TypeError.
    """
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances]
                                  for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(labels,
                                                 batch_first=True,
                                                 padding_value=IGNORE_INDEX)
        # Hard-truncate over-long sequences to the model context window.
        input_ids = input_ids[:, :self.tokenizer.model_max_length]
        labels = labels[:, :self.tokenizer.model_max_length]
        batch = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

        # work for 'images' argument in `prepare_inputs_labels_for_multimodal` of LlavaMetaForCausalLM in llava_arch.py
        batch['images'] = []
        for instance in instances:
            for modal_token in MODAL_INDEX_MAP.keys():
                modal_token = modal_token.lower()
                # MODAL_TOKEN shape like: <image>, <video>, ...
                modal_name = re.findall(f'[<](.*)[>]', modal_token)
                assert len(modal_name) == 1
                modal_name = modal_name[0]
                if modal_name in instance:
                    batch['images'].append((instance[modal_name], modal_name))
        return batch
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> Dict:
    """Build the train dataset and collator for supervised fine-tuning."""
    dataset = LazySupervisedDataset(
        tokenizer=tokenizer,
        data_path=data_args.data_path,
        data_path_a=data_args.data_path_a,
        data_args=data_args,
    )
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return {
        'train_dataset': dataset,
        'eval_dataset': None,
        'data_collator': collator,
    }
def train(attn_implementation="flash_attention_2"):
    """End-to-end training entry point.

    Parses CLI args, loads model/tokenizer (optionally quantized / LoRA),
    initializes vision and/or audio towers and their projectors, builds the
    data module, runs VideoLLaMA2Trainer, and saves the final weights.

    :param attn_implementation: attention backend passed to the model config
        (gemma2 models are forced to 'eager').
    """
    global local_rank
    set_seed(42)

    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    local_rank = training_args.local_rank
    # fp16 > bf16 > fp32 precedence for the compute dtype.
    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))

    # --- optional 4/8-bit quantized loading via bitsandbytes ---
    bnb_model_from_pretrained_args = {}
    if training_args.bits in [4, 8]:
        from transformers import BitsAndBytesConfig
        bnb_model_from_pretrained_args.update(dict(
            # device_map={"": training_args.device},
            # BUG: High version transformers report error:
            # ValueError: You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time
            # load_in_4bit=training_args.bits == 4,
            # load_in_8bit=training_args.bits == 8,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=training_args.bits == 4,
                load_in_8bit=training_args.bits == 8,
                # keep the projector in higher precision
                llm_int8_skip_modules=["mm_projector"],
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=training_args.double_quant,
                bnb_4bit_quant_type=training_args.quant_type,  # {'fp4', 'nf4'}
                bnb_4bit_quant_storage=compute_dtype,
            )
        ))

    # --- model config + model loading ---
    config = VLLMConfigs[model_args.model_type].from_pretrained(model_args.model_path, trust_remote_code=True)

    if 'gemma2' in model_args.model_type:
        # gemma2 does not support flash attention here; force eager.
        config._attn_implementation = 'eager'
    else:
        config._attn_implementation = attn_implementation

    if model_args.vision_tower is not None or model_args.audio_tower is not None:
        # Multimodal path: load the VideoLLaMA2 model class for this backbone.
        model = VLLMs[model_args.model_type].from_pretrained(
            model_args.model_path,
            config=config,
            cache_dir=training_args.cache_dir,
            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
            do_sample=True,
            **bnb_model_from_pretrained_args
        )
        if 'mixtral' in model_args.model_type:
            # MoE blocks must be registered as ZeRO-3 leaf modules so
            # deepspeed does not partition inside the expert routing.
            import deepspeed
            deepspeed.utils.set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
    else:
        # Text-only fallback.
        model = transformers.LlamaForCausalLM.from_pretrained(
            model_args.model_path,
            config=config,
            cache_dir=training_args.cache_dir,
            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
            do_sample=True,
            **bnb_model_from_pretrained_args
        )
    # KV cache is useless during training and wastes memory.
    model.config.use_cache = False

    if training_args.bits in [4, 8]:
        from peft import prepare_model_for_kbit_training
        model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)

    if training_args.gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            # Older transformers: hook the embedding output so checkpointed
            # segments still get gradients.
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)
            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    # --- optional LoRA ---
    if training_args.lora_enable:
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=training_args.lora_r,
            lora_alpha=training_args.lora_alpha,
            target_modules=find_all_linear_names(model),
            lora_dropout=training_args.lora_dropout,
            bias=training_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if training_args.bits == 16:
            if training_args.bf16:
                model.to(torch.bfloat16)
            if training_args.fp16:
                model.to(torch.float16)
        rank0_print("Adding LoRA adapters...")
        model = get_peft_model(model, lora_config)

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=True,
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.unk_token

    # --- vision tower + projector setup ---
    if model_args.vision_tower is not None:
        # initialize vision encoder + multi-modal projector
        model.get_model().initialize_vision_modules(model_args=model_args, fsdp=training_args.fsdp)

        vision_tower = model.get_vision_tower()
        vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

        data_args.image_size = vision_tower.image_size
        data_args.image_processor = vision_tower.image_processor
        data_args.video_processor = vision_tower.video_processor if hasattr(vision_tower, "video_processor") else vision_tower.image_processor
        data_args.is_multimodal = True

        model.config.image_aspect_ratio = data_args.image_aspect_ratio
        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
        if model_args.tune_mm_mlp_adapter:
            # Projector-only training: freeze everything, unfreeze projector.
            model.requires_grad_(False)
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = True

        # Projector-only training uses the plain (pretraining) preprocessing.
        if model_args.tune_mm_mlp_adapter:
            data_args.is_pretraining = True
        else:
            data_args.is_pretraining = False

        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
        if training_args.freeze_mm_mlp_adapter:
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = False

        if training_args.bits in [4, 8]:
            model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)

        model.config.mm_projector_lr = training_args.mm_projector_lr
        model.config.num_frames = NUM_FRAMES if data_args.num_frames is None else data_args.num_frames

    # --- audio tower + projector setup ---
    if model_args.audio_tower is not None:
        # initialize audio encoder + multi-modal projector
        model.get_model().initialize_audio_modules(
            model_args=model_args,
            fsdp=training_args.fsdp
        )
        audio_tower = model.get_audio_tower()
        audio_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

        data_args.is_multimodal = True

        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        model.config.tune_mm_mlp_adapter_a = training_args.tune_mm_mlp_adapter_a = model_args.tune_mm_mlp_adapter_a
        training_args.pretrain_mm_mlp_adapter_a = model_args.pretrain_mm_mlp_adapter_a
        training_args.tune_audio_tower = model_args.tune_audio_tower

        # only update mm_mlp's parameters while the remaining ones are kept frozen
        if model_args.tune_mm_mlp_adapter_a:
            model.requires_grad_(False)
            for p in model.get_model().mm_projector_a.parameters():
                p.requires_grad = True

        # Full tuning (audio tower or adapter+LLM) uses SFT preprocessing;
        # projector-only audio training uses the plain pipeline.
        if model_args.tune_audio_tower or model_args.tune_adapter_llm:
            data_args.is_pretraining = False
        else:
            data_args.is_pretraining = True

        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
        if training_args.freeze_mm_mlp_adapter:
            for p in model.get_model().mm_projector_a.parameters():
                p.requires_grad = False

        if model_args.tune_adapter_llm:
            # Unfreeze everything, then re-freeze the encoder towers.
            model.requires_grad_(True)
            if hasattr(model.get_model(), 'vision_tower'):
                for p in model.get_model().vision_tower.parameters():
                    p.requires_grad = False
            for p in model.get_model().audio_tower.parameters():
                p.requires_grad = False

        if model_args.freeze_backbone:
            model.requires_grad_(False)

        # Audio tower grads follow tune_audio_tower regardless of the above.
        if model_args.tune_audio_tower:
            for p in model.get_model().audio_tower.parameters():
                p.requires_grad = True
        else:
            for p in model.get_model().audio_tower.parameters():
                p.requires_grad = False

        if training_args.bits in [4, 8]:
            model.get_model().mm_projector_a.to(dtype=compute_dtype, device=training_args.device)

        model.config.mm_projector_lr = training_args.mm_projector_lr

    # --- dtype fixups for quantized + LoRA layers ---
    if training_args.bits in [4, 8]:
        from peft.tuners.lora import LoraLayer
        for name, module in model.named_modules():
            if isinstance(module, LoraLayer):
                if training_args.bf16:
                    module = module.to(torch.bfloat16)
            if 'norm' in name:
                # norms stay in fp32 for numerical stability
                module = module.to(torch.float32)
            if 'lm_head' in name or 'embed_tokens' in name:
                if hasattr(module, 'weight'):
                    if training_args.bf16 and module.weight.dtype == torch.float32:
                        module = module.to(torch.bfloat16)

    print("Current model:", model)
    '''
    for name, param in model.named_parameters():
        # Check if the parameter requires gradient
        if param.requires_grad:
            print(f'Parameter: {name} is trainable')
        else:
            print(f'Parameter: {name} is frozen')
    '''

    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
    # select a Trainer
    trainer = VideoLLaMA2Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)

    # Resume automatically when a checkpoint already exists in output_dir.
    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()
    trainer.save_state()

    model.config.use_cache = True

    if training_args.lora_enable:
        # Save LoRA weights and non-LoRA trainables separately.
        state_dict = get_peft_state_maybe_zero_3(model.named_parameters(), training_args.lora_bias)
        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(model.named_parameters())
        if training_args.local_rank == 0 or training_args.local_rank == -1:
            model.config.save_pretrained(training_args.output_dir)
            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
    else:
        safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
# Script entry point: run training only when executed directly.
if __name__ == "__main__":
    train()