diff --git a/.gitattributes b/.gitattributes index 5167be1c2d32403222cec29a70185edf980f3f38..dd5450dc7afd2aec66d2967223bad928c264ff50 100644 --- a/.gitattributes +++ b/.gitattributes @@ -38,3 +38,8 @@ Unicorn/wandb/run-20260114_135552-sjoswxwz/run-sjoswxwz.wandb filter=lfs diff=lf Unicorn/wandb/run-20260114_170827-uobkoafb/run-uobkoafb.wandb filter=lfs diff=lfs merge=lfs -text Unicorn/wandb/run-20260115_103501-4tsjsu0t/run-4tsjsu0t.wandb filter=lfs diff=lfs merge=lfs -text Unicorn/wandb/run-20260115_230712-6c574jt7/run-6c574jt7.wandb filter=lfs diff=lfs merge=lfs -text +Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/run-idx5bv8k.wandb filter=lfs diff=lfs merge=lfs -text +Unicorn_dense/wandb/run-20260114_135602-iex36np1/run-iex36np1.wandb filter=lfs diff=lfs merge=lfs -text +Unicorn_dense/wandb/run-20260114_170827-as22o934/run-as22o934.wandb filter=lfs diff=lfs merge=lfs -text +Unicorn_dense/wandb/run-20260115_103508-pagdc54a/run-pagdc54a.wandb filter=lfs diff=lfs merge=lfs -text +Unicorn_dense/wandb/run-20260115_230908-tzltcduc/run-tzltcduc.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/Unicorn_dense/.DS_Store b/Unicorn_dense/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..98415cde298f36a345107bee75c04be6869a8b00 Binary files /dev/null and b/Unicorn_dense/.DS_Store differ diff --git a/Unicorn_dense/bunny/.DS_Store b/Unicorn_dense/bunny/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..79830bb6f17a58bc4c7e8b13ea6da2f09c09e252 Binary files /dev/null and b/Unicorn_dense/bunny/.DS_Store differ diff --git a/Unicorn_dense/bunny/__init__.py b/Unicorn_dense/bunny/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Unicorn_dense/bunny/__pycache__/__init__.cpython-310.pyc b/Unicorn_dense/bunny/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd10a168bf8aaa537639a0da88b7c68d301a216c Binary files /dev/null and b/Unicorn_dense/bunny/__pycache__/__init__.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/__pycache__/constants.cpython-310.pyc b/Unicorn_dense/bunny/__pycache__/constants.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06fda57c782b92a41cf515fa862ace4409b25ae1 Binary files /dev/null and b/Unicorn_dense/bunny/__pycache__/constants.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/__pycache__/conversation.cpython-310.pyc b/Unicorn_dense/bunny/__pycache__/conversation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..109c040a57f3c7792c50d5f2d5cab999ed702403 Binary files /dev/null and b/Unicorn_dense/bunny/__pycache__/conversation.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/constants.py b/Unicorn_dense/bunny/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..2ea90e94fbd388819ec785304b852459c3ea2735 --- /dev/null +++ b/Unicorn_dense/bunny/constants.py @@ -0,0 +1,7 @@ +# Model Constants +IGNORE_INDEX = -100 +IMAGE_TOKEN_INDEX = -200 +DEFAULT_IMAGE_TOKEN = "" +CONTROLLER_HEART_BEAT_EXPIRATION = 30 +LOGDIR = "gradio-logs" +WORKER_HEART_BEAT_INTERVAL = 15 diff --git a/Unicorn_dense/bunny/conversation.py b/Unicorn_dense/bunny/conversation.py new file mode 100644 index 0000000000000000000000000000000000000000..f1bad84cef55de74d87dcb2f3fc5852de2e48df7 --- /dev/null +++ b/Unicorn_dense/bunny/conversation.py @@ -0,0 +1,239 @@ +import dataclasses +from enum import auto, Enum +from typing import List + + +class SeparatorStyle(Enum): + """Different separator style.""" + TWO = auto() + PLAIN = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + system: str + roles: List[str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle + sep: str = "###" + sep2: str = None + version: str = "Unknown" + + skip_next: bool = False + + def get_prompt(self): + messages = self.messages + if len(messages) > 0 and type(messages[0][1]) is tuple: + messages = self.messages.copy() + init_role, init_msg = messages[0].copy() + init_msg = init_msg[0].replace("", "").strip() + if 'mmtag' in self.version: + messages[0] = (init_role, init_msg) + messages.insert(0, (self.roles[0], "")) + messages.insert(1, (self.roles[1], "Received.")) + else: + messages[0] = (init_role, "\n" + init_msg) + + if self.sep_style == SeparatorStyle.TWO: + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + + elif self.sep_style == SeparatorStyle.PLAIN: + seps = [self.sep, self.sep2] + ret = self.system + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += message + seps[i % 2] + else: + ret += "" + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + return ret + + def append_message(self, role, message): + self.messages.append([role, message]) + + def get_images(self, return_pil=False): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + if type(msg) is tuple: + import base64 + from io import BytesIO + from PIL import Image + msg, image, image_process_mode = msg + if image_process_mode == "Pad": + def expand2square(pil_img, background_color=(122, 116, 104)): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image) + elif image_process_mode in ["Default", "Crop"]: + pass + elif image_process_mode == "Resize": + image = image.resize((336, 336)) + else: + raise ValueError(f"Invalid image_process_mode: {image_process_mode}") + + if return_pil: + images.append(image) + else: + buffered = BytesIO() + image.save(buffered, format="PNG") + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + images.append(img_b64_str) + return images + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + if type(msg) is tuple: + import base64 + from io import BytesIO + msg, image, image_process_mode = msg + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 800, 400 + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + img_str = f'user upload image' + msg = img_str + msg.replace('', '').strip() + ret.append([msg, None]) + else: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + version=self.version) + + def dict(self): + if len(self.get_images()) > 0: + return { + "system": self.system, + "roles": self.roles, + "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages], + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + return { + "system": self.system, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + + +conv_bunny = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="bunny", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="<|endoftext|>", +) + +conv_phi3 = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="phi3", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="<|endoftext|>", +) + +conv_minicpm = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="minicpm", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + +conv_llama = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="llama", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="<|end_of_text|>", +) + +conv_plain = Conversation( + system="", + roles=("", ""), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.PLAIN, + sep="\n", +) + +default_conversation = conv_bunny +conv_templates = { + "default": conv_bunny, + "bunny": conv_bunny, + "phi3": conv_phi3, + "plain": conv_plain, + 'minicpm': conv_minicpm, + 'llama': conv_llama +} + +if __name__ == "__main__": + print(default_conversation.get_prompt()) diff --git a/Unicorn_dense/bunny/model/.DS_Store b/Unicorn_dense/bunny/model/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..1154d90116341ce56c406896462d1046ca85673a Binary files /dev/null and b/Unicorn_dense/bunny/model/.DS_Store differ diff --git a/Unicorn_dense/bunny/model/__init__.py b/Unicorn_dense/bunny/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ba67a6176d4a57113002e57408edacde300b7eb7 --- /dev/null +++ b/Unicorn_dense/bunny/model/__init__.py @@ -0,0 +1,6 @@ +from .language_model.bunny_phi import BunnyPhiForCausalLM, BunnyPhiConfig +from .language_model.bunny_stablelm import BunnyStableLMForCausalLM, BunnyStableLMConfig +from .language_model.bunny_qwen import BunnyQwen2ForCausalLM, BunnyQwen2Config +from .language_model.bunny_minicpm import BunnyMiniCPMForCausalLM, BunnyMiniCPMConfig +from .language_model.bunny_llama import BunnyLlamaForCausalLM, BunnyLlamaConfig +from .language_model.bunny_phi3 import BunnyPhi3ForCausalLM, BunnyPhi3Config diff --git a/Unicorn_dense/bunny/model/__pycache__/__init__.cpython-310.pyc b/Unicorn_dense/bunny/model/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c746242b6cd929f3b88b0755ffd47dd5cc7c6873 Binary files /dev/null and b/Unicorn_dense/bunny/model/__pycache__/__init__.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/__pycache__/bunny_arch.cpython-310.pyc b/Unicorn_dense/bunny/model/__pycache__/bunny_arch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3113649d059e3c8ac471a8e48d832038dba5be90 Binary files /dev/null and b/Unicorn_dense/bunny/model/__pycache__/bunny_arch.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/builder.py b/Unicorn_dense/bunny/model/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..6694855d05c99adcd21bc566c34cba79387c0079 --- /dev/null +++ b/Unicorn_dense/bunny/model/builder.py @@ -0,0 +1,49 @@ +import torch +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer +import warnings +import transformers +# disable some warnings +transformers.logging.set_verbosity_error() +transformers.logging.disable_progress_bar() +warnings.filterwarnings('ignore') + +import sys + +# 把 /data/xmyu/Bunny_text/ 加进 sys.path,以便后续 import +sys.path.insert(0, "/data/xmyu/Bunny_text") +from bunny.model.language_model.bunny_llama import BunnyLlamaConfig, BunnyLlamaForCausalLM + + +def load_pretrained_model(model_path, model_base, model_name, model_type, load_8bit=False, load_4bit=False, + device_map="auto", device="cuda", **kwargs): + + # Our Model + # model = AutoModelForCausalLM.from_pretrained( + # '/data/xmyu/finished-checkpoints/no-transfer/checkpoints-llama3-8b/bunny-llama3-8b', + # torch_dtype=torch.float16, # float32 for cpu + # trust_remote_code=True + # # device_map='auto' + # ).to("cuda") + + # tokenizer = AutoTokenizer.from_pretrained( + # '/data/xmyu/finished-checkpoints/no-transfer/checkpoints-llama3-8b/bunny-llama3-8b', + # trust_remote_code=True + # ) + + + # Our Model + model = AutoModelForCausalLM.from_pretrained( + '/data/xmyu/finished-checkpoints/mean_shift/checkpoints-llama3-8b/bunny-llama3-8b', + torch_dtype=torch.float16, # float32 for cpu + trust_remote_code=True + # device_map='auto' + ).to("cuda") + + tokenizer = AutoTokenizer.from_pretrained( + '/data/xmyu/finished-checkpoints/mean_shift/checkpoints-llama3-8b/bunny-llama3-8b', + trust_remote_code=True + ) + + + return tokenizer, model, 512 diff --git a/Unicorn_dense/bunny/model/bunny_arch.py b/Unicorn_dense/bunny/model/bunny_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..0c947e0ac7a2abe83cf7d694990efd5257893d97 --- /dev/null +++ b/Unicorn_dense/bunny/model/bunny_arch.py @@ -0,0 +1,244 @@ +from abc import ABC, abstractmethod +import os +import torch +from .multimodal_projector.builder import build_vision_projector + +from bunny.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX + + +class BunnyMetaModel: + + def __init__(self, config): + super(BunnyMetaModel, self).__init__(config) + + # 修改这里:不要使用 if True + # 使用 hasattr 检查配置中是否包含 mm_hidden_size。 + # 1. 训练开始加载 Base Model 时,没有该属性,跳过构建(防止报错)。 + # 后续 train.py 会调用 initialize_vision_modules 手动初始化它。 + # 2. 推理加载训练好的 Bunny Model 时,Config 里有该属性,直接构建。 + + if hasattr(config, "mm_hidden_size"): + if getattr(config, 'continuous_training', False): + config.continuous_training = False + self.mm_projector = build_vision_projector(config) + + def initialize_vision_modules(self, model_args): + + pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter + + self.config.use_mm_proj = True + self.config.mm_projector_type = getattr(model_args, 'mm_projector_type') + self.config.mm_hidden_size = 1280 + + if getattr(self, 'mm_projector', None) is None: + self.mm_projector = build_vision_projector(self.config) + # else: + # In case it is frozen by LoRA + # for p in self.mm_projector.parameters(): + # p.requires_grad = True + + if pretrain_mm_mlp_adapter is not None: + mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu') + + def get_w(weights, keyword): + return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k} + + self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector')) + + +class BunnyMetaForCausalLM(ABC): + + @abstractmethod + def get_model(self): + pass + + def get_image_feature(self, embeds): + + # 传给 projector 的 image feature 形状 [batch, 1280] + + + # print('') + # print(embeds) + # print('') + + # 将 [batch, mm_hidden_size] 扩展为 [batch, seq, mm_hidden_size] + seq = 576 + embeds = embeds.unsqueeze(1).expand(-1, seq, -1) + + # embeds = self.mm_projector(embeds) + + embeds = self.get_model().mm_projector(embeds) + + # print('embeds2.shape', embeds.shape) + + # print('') + + return embeds # [batch, 1280] + + + def prepare_inputs_labels_for_multimodal( + self, input_ids, position_ids, attention_mask, past_key_values, labels, embeds + ): + + # print('<111111------------------------------------------>') + # print(embeds) + # print('<111111------------------------------------------>') + + if embeds is None or input_ids.shape[1] == 1: + if past_key_values is not None and embeds is not None and input_ids.shape[ + 1] == 1: + target_shape = past_key_values[-1][-1].shape[-2] + 1 + attention_mask = torch.cat((attention_mask, torch.ones( + (attention_mask.shape[0], target_shape - attention_mask.shape[1]), + dtype=attention_mask.dtype, + device=attention_mask.device + )), dim=1) + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + return input_ids, position_ids, attention_mask, past_key_values, None, labels + + + + if embeds is not None: + + # print('') + # print(embeds) + # print('') + # concat_images = torch.cat([image for image in images], dim=0) + image_features = self.get_image_feature(embeds) # [batch, 1280] + + # print('') + # print(image_features.shape) + # print('') + + + + + # Let's just add dummy tensors if they do not exist, + # it is a headache to deal with None all the time. + # But it is not ideal, and if you have a better idea, + # please open an issue / submit a PR, thanks. + _labels = labels + _position_ids = position_ids + _attention_mask = attention_mask + if attention_mask is None: + attention_mask = torch.ones_like(input_ids, dtype=torch.bool) + else: + attention_mask = attention_mask.bool() + if position_ids is None: + position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device) + if labels is None: + labels = torch.full_like(input_ids, IGNORE_INDEX) + + input_ids_temp = input_ids # points to the actual input_ids tensor + + # remove the padding using attention_mask -- TODO: double check + input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in + zip(input_ids, attention_mask)] + labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)] + + # -- TODO: better implementation? + # replace IMAGE_TOKEN_INDEX(-200) with 0 to be compatible with repetition penalty + input_ids_temp[input_ids_temp == IMAGE_TOKEN_INDEX] = 0 + + new_input_embeds = [] + new_labels = [] + cur_image_idx = 0 + for batch_idx, cur_input_ids in enumerate(input_ids): + num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum() + if num_images == 0: + cur_image_features = image_features[cur_image_idx] + cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids) + cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0) + new_input_embeds.append(cur_input_embeds) + new_labels.append(labels[batch_idx]) + cur_image_idx += 1 + continue + + image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [ + cur_input_ids.shape[0]] + cur_input_ids_noim = [] + cur_labels = labels[batch_idx] + cur_labels_noim = [] + for i in range(len(image_token_indices) - 1): + cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1:image_token_indices[i + 1]]) + cur_labels_noim.append(cur_labels[image_token_indices[i] + 1:image_token_indices[i + 1]]) + split_sizes = [x.shape[0] for x in cur_labels_noim] + cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim)) + cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0) + cur_new_input_embeds = [] + cur_new_labels = [] + + for i in range(num_images + 1): + cur_new_input_embeds.append(cur_input_embeds_no_im[i]) + cur_new_labels.append(cur_labels_noim[i]) + if i < num_images: + cur_image_features = image_features[cur_image_idx] + cur_image_idx += 1 + cur_new_input_embeds.append(cur_image_features) + cur_new_labels.append( + torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, + dtype=cur_labels.dtype)) + + cur_new_input_embeds = torch.cat(cur_new_input_embeds) + cur_new_labels = torch.cat(cur_new_labels) + + new_input_embeds.append(cur_new_input_embeds) + new_labels.append(cur_new_labels) + + # Truncate sequences to max length as image embeddings can make the sequence longer + tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None) + if tokenizer_model_max_length is not None: + new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds] + new_labels = [x[:tokenizer_model_max_length] for x in new_labels] + + # Combine them + max_len = max(x.shape[0] for x in new_input_embeds) + batch_size = len(new_input_embeds) + + new_input_embeds_padded = [] + new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, + device=new_labels[0].device) + attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device) + position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device) + + for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)): + cur_len = cur_new_embed.shape[0] + if getattr(self.config, 'tokenizer_padding_side', 'right') == "left": + new_input_embeds_padded.append(torch.cat(( + torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, + device=cur_new_embed.device), + cur_new_embed + ), dim=0)) + if cur_len > 0: + new_labels_padded[i, -cur_len:] = cur_new_labels + attention_mask[i, -cur_len:] = True + position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, + device=position_ids.device) + else: + new_input_embeds_padded.append(torch.cat(( + cur_new_embed, + torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, + device=cur_new_embed.device) + ), dim=0)) + if cur_len > 0: + new_labels_padded[i, :cur_len] = cur_new_labels + attention_mask[i, :cur_len] = True + position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, + device=position_ids.device) + + new_input_embeds = torch.stack(new_input_embeds_padded, dim=0) + + if _labels is None: + new_labels = None + else: + new_labels = new_labels_padded + + if _attention_mask is None: + attention_mask = None + else: + attention_mask = attention_mask.to(dtype=_attention_mask.dtype) + + if _position_ids is None: + position_ids = None + + return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels diff --git a/Unicorn_dense/bunny/model/language_model/__init__.py b/Unicorn_dense/bunny/model/language_model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Unicorn_dense/bunny/model/language_model/__pycache__/__init__.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b039b2aca8c492ca376a43390b8b4edf34179478 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/__pycache__/__init__.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_llama.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_llama.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e985186ff061f3623b80b8ccaec1da32a8197579 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_llama.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_minicpm.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_minicpm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57a7c05051219432d3c91572501ab1780ad90bbd Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_minicpm.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_phi.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_phi.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9448f0c76a2a5779ec01071a9ab54e7cb3b39bf5 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_phi.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_phi3.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_phi3.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5547fb62d9439dd1631282773ba68e423b226e6a Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_phi3.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_qwen.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_qwen.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eba018f03b914047a084552c16f49c72325cde50 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_qwen.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_stablelm.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_stablelm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8637d7e84613e677589d76423644643bb90deb1 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/__pycache__/bunny_stablelm.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/bunny_llama.py b/Unicorn_dense/bunny/model/language_model/bunny_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..ad6a30385bb8fe544d3631b998e36a2981e66f96 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/bunny_llama.py @@ -0,0 +1,103 @@ +from typing import List, Optional, Tuple, Union +import os +import pickle +import torch +import torch.nn as nn +from transformers import AutoConfig, AutoModelForCausalLM + +from .llama import LlamaModel, LlamaConfig, LlamaForCausalLM + +from transformers.modeling_outputs import CausalLMOutputWithPast + +from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM + + +class BunnyLlamaConfig(LlamaConfig): + model_type = "bunny-llama" + + +class BunnyLlamaModel(BunnyMetaModel, LlamaModel): + config_class = BunnyLlamaConfig + + def __init__(self, config: LlamaConfig): + super(BunnyLlamaModel, self).__init__(config) + + +class BunnyLlamaForCausalLM(LlamaForCausalLM, BunnyMetaForCausalLM): + config_class = BunnyLlamaConfig + + def __init__(self, config): + super(LlamaForCausalLM, self).__init__(config) + self.model = BunnyLlamaModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + embeds: Optional[list] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + embeds + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=None + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, + **kwargs): + embeds = kwargs.pop("embeds", None) + + _inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, + **kwargs + ) + + if embeds is not None: + _inputs['embeds'] = embeds + + return _inputs + + +AutoConfig.register("bunny-llama", BunnyLlamaConfig) +AutoModelForCausalLM.register(BunnyLlamaConfig, BunnyLlamaForCausalLM) diff --git a/Unicorn_dense/bunny/model/language_model/bunny_minicpm.py b/Unicorn_dense/bunny/model/language_model/bunny_minicpm.py new file mode 100644 index 0000000000000000000000000000000000000000..4deb84fa3480cff693e24bbc5f6c7863461b9eb7 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/bunny_minicpm.py @@ -0,0 +1,103 @@ +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from transformers import AutoConfig, AutoModelForCausalLM + +from bunny.model.language_model.minicpm.modeling_minicpm import MiniCPMModel, MiniCPMForCausalLM +from bunny.model.language_model.minicpm.configuration_minicpm import MiniCPMConfig + +from transformers.modeling_outputs import CausalLMOutputWithPast + +from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM + + +class BunnyMiniCPMConfig(MiniCPMConfig): + model_type = "bunny-minicpm" + + +class BunnyMiniCPMModel(BunnyMetaModel, MiniCPMModel): + config_class = BunnyMiniCPMConfig + + def __init__(self, config: MiniCPMConfig): + super(BunnyMiniCPMModel, self).__init__(config) + + +class BunnyMiniCPMForCausalLM(MiniCPMForCausalLM, BunnyMetaForCausalLM): + config_class = BunnyMiniCPMConfig + + def __init__(self, config): + super(MiniCPMForCausalLM, self).__init__(config) + self.model = BunnyMiniCPMModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images + ) + if inputs_embeds is not None: + inputs_embeds *= self.get_model().config.scale_emb + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, + **kwargs): + images = kwargs.pop("images", None) + + _inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, + **kwargs + ) + + if images is not None: + _inputs['images'] = images + return _inputs + + +AutoConfig.register("bunny-minicpm", BunnyMiniCPMConfig) +AutoModelForCausalLM.register(BunnyMiniCPMConfig, BunnyMiniCPMForCausalLM) diff --git a/Unicorn_dense/bunny/model/language_model/bunny_phi.py b/Unicorn_dense/bunny/model/language_model/bunny_phi.py new file mode 100644 index 0000000000000000000000000000000000000000..fddb262df36aae2c36767c2afe980b59cc26568e --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/bunny_phi.py @@ -0,0 +1,100 @@ +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from transformers import AutoConfig, AutoModelForCausalLM + +from .phi import PhiModel, PhiConfig, PhiForCausalLM + +from transformers.modeling_outputs import CausalLMOutputWithPast + +from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM + + +class BunnyPhiConfig(PhiConfig): + model_type = "bunny-phi" + + +class BunnyPhiModel(BunnyMetaModel, PhiModel): + config_class = BunnyPhiConfig + + def __init__(self, config: PhiConfig): + super(BunnyPhiModel, self).__init__(config) + + +class BunnyPhiForCausalLM(PhiForCausalLM, BunnyMetaForCausalLM): + config_class = BunnyPhiConfig + + def __init__(self, config): + super(PhiForCausalLM, self).__init__(config) + self.model = BunnyPhiModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, + **kwargs): + images = kwargs.pop("images", None) + + _inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, + **kwargs + ) + + if images is not None: + _inputs['images'] = images + return _inputs + + +AutoConfig.register("bunny-phi", BunnyPhiConfig) +AutoModelForCausalLM.register(BunnyPhiConfig, BunnyPhiForCausalLM) diff --git a/Unicorn_dense/bunny/model/language_model/bunny_phi3.py b/Unicorn_dense/bunny/model/language_model/bunny_phi3.py new file mode 100644 index 0000000000000000000000000000000000000000..343b089b19288a66b18ae8784ce04ba710c2e67f --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/bunny_phi3.py @@ -0,0 +1,100 @@ +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from transformers import AutoConfig, AutoModelForCausalLM + +from .phi3 import Phi3Model, Phi3Config, Phi3ForCausalLM + +from transformers.modeling_outputs import CausalLMOutputWithPast + +from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM + + +class BunnyPhi3Config(Phi3Config): + model_type = "bunny-phi3" + + +class BunnyPhi3Model(BunnyMetaModel, Phi3Model): + config_class = BunnyPhi3Config + + def __init__(self, config: Phi3Config): + super(BunnyPhi3Model, self).__init__(config) + + +class BunnyPhi3ForCausalLM(Phi3ForCausalLM, BunnyMetaForCausalLM): + config_class = BunnyPhi3Config + + def __init__(self, config): + super(Phi3ForCausalLM, self).__init__(config) + self.model = BunnyPhi3Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, + **kwargs): + images = kwargs.pop("images", None) + + _inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, + **kwargs + ) + + if images is not None: + _inputs['images'] = images + return _inputs + + +AutoConfig.register("bunny-phi3", BunnyPhi3Config) +AutoModelForCausalLM.register(BunnyPhi3Config, BunnyPhi3ForCausalLM) diff --git a/Unicorn_dense/bunny/model/language_model/bunny_qwen.py b/Unicorn_dense/bunny/model/language_model/bunny_qwen.py new file mode 100644 index 0000000000000000000000000000000000000000..ba67a00a72256514120e3627dd8ffa8f39068414 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/bunny_qwen.py @@ -0,0 +1,100 @@ +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from transformers import AutoConfig, AutoModelForCausalLM + +from .qwen2 import Qwen2Model, Qwen2Config, Qwen2ForCausalLM + +from transformers.modeling_outputs import CausalLMOutputWithPast + +from ..bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM + + +class BunnyQwen2Config(Qwen2Config): + model_type = "bunny-qwen2" + + +class BunnyQwen2Model(BunnyMetaModel, Qwen2Model): + config_class = BunnyQwen2Config + + def __init__(self, config: Qwen2Config): + super(BunnyQwen2Model, self).__init__(config) + + +class BunnyQwen2ForCausalLM(Qwen2ForCausalLM, BunnyMetaForCausalLM): + config_class = BunnyQwen2Config + + def __init__(self, config): + super(Qwen2ForCausalLM, self).__init__(config) + self.model = BunnyQwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, + **kwargs): + images = kwargs.pop("images", None) + + _inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, + **kwargs + ) + + if images is not None: + _inputs['images'] = images + return _inputs + + +AutoConfig.register("bunny-qwen2", BunnyQwen2Config) +AutoModelForCausalLM.register(BunnyQwen2Config, BunnyQwen2ForCausalLM) diff --git a/Unicorn_dense/bunny/model/language_model/bunny_stablelm.py b/Unicorn_dense/bunny/model/language_model/bunny_stablelm.py new file mode 100644 index 0000000000000000000000000000000000000000..4edcb0ad574fe675ba3bcf4aed7aad0c94dad255 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/bunny_stablelm.py @@ -0,0 +1,100 @@ +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from transformers import AutoConfig, AutoModelForCausalLM + +from bunny.model.language_model.stable_lm.modeling_stablelm_epoch import StableLMEpochModel, StableLMEpochConfig, \ + StableLMEpochForCausalLM + +from transformers.modeling_outputs import CausalLMOutputWithPast + +from bunny.model.bunny_arch import BunnyMetaModel, BunnyMetaForCausalLM + + +class BunnyStableLMConfig(StableLMEpochConfig): + model_type = "bunny-stablelm" + + +class BunnyStableLMModel(BunnyMetaModel, StableLMEpochModel): + config_class = BunnyStableLMConfig + + def __init__(self, config: StableLMEpochConfig): + super(BunnyStableLMModel, self).__init__(config) + + +class BunnyStableLMForCausalLM(StableLMEpochForCausalLM, BunnyMetaForCausalLM): + config_class = BunnyStableLMConfig + + def __init__(self, config): + super(StableLMEpochForCausalLM, self).__init__(config) + self.model = BunnyStableLMModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, + **kwargs): + images = kwargs.pop("images", None) + + _inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, + **kwargs + ) + + if images is not None: + _inputs['images'] = images + return _inputs + + +AutoConfig.register("bunny-stablelm", BunnyStableLMConfig) +AutoModelForCausalLM.register(BunnyStableLMConfig, BunnyStableLMForCausalLM) diff --git a/Unicorn_dense/bunny/model/language_model/llama/__init__.py b/Unicorn_dense/bunny/model/language_model/llama/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5d36246dd8ebe52e7badf1c9f3ca397a65497dba --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/llama/__init__.py @@ -0,0 +1,114 @@ +# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from transformers.utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_flax_available, + is_sentencepiece_available, + is_tokenizers_available, + is_torch_available, +) + + +_import_structure = { + "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlamaConfig"], +} + +try: + if not is_sentencepiece_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_llama"] = ["LlamaTokenizer"] + +try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_llama"] = [ + "LlamaForCausalLM", + "LlamaModel", + "LlamaPreTrainedModel", + "LlamaForSequenceClassification", + "LlamaForQuestionAnswering", + ] + +try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_flax_llama"] = ["FlaxLlamaForCausalLM", "FlaxLlamaModel", "FlaxLlamaPreTrainedModel"] + + +if TYPE_CHECKING: + from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig + + try: + if not is_sentencepiece_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_llama import LlamaTokenizer + + try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_llama_fast import LlamaTokenizerFast + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_llama import ( + LlamaForCausalLM, + LlamaForQuestionAnswering, + LlamaForSequenceClassification, + LlamaModel, + LlamaPreTrainedModel, + ) + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel, FlaxLlamaPreTrainedModel + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/Unicorn_dense/bunny/model/language_model/llama/__pycache__/__init__.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/llama/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a10ebb42ea2c5b90904e535480f1934b2f48fab Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/llama/__pycache__/__init__.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/llama/__pycache__/configuration_llama.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/llama/__pycache__/configuration_llama.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05e972015599338293c9c82739adb320f539e742 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/llama/__pycache__/configuration_llama.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/llama/__pycache__/modeling_llama.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/llama/__pycache__/modeling_llama.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2bc2b94c53e0a47f38a0bbee879f16ab217e0ccb Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/llama/__pycache__/modeling_llama.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/llama/configuration_llama.py b/Unicorn_dense/bunny/model/language_model/llama/configuration_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..29cab2b8a27ef943877c863b819196036a91bcd9 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/llama/configuration_llama.py @@ -0,0 +1,191 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" LLaMA model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +# from ..deprecated._archive_maps import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP # noqa: F401, E402 + + +class LlamaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the LLaMA-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens, + Llama 2 up to 4096, CodeLlama up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + ```python + >>> from transformers import LlamaModel, LlamaConfig + + >>> # Initializing a LLaMA llama-7b style configuration + >>> configuration = LlamaConfig() + + >>> # Initializing a model from the llama-7b style configuration + >>> model = LlamaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "llama" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") diff --git a/Unicorn_dense/bunny/model/language_model/llama/modeling_llama.py b/Unicorn_dense/bunny/model/language_model/llama/modeling_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..b27c342829cad23e14c8254c63a75e53ab24bf5b --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/llama/modeling_llama.py @@ -0,0 +1,1844 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch LLaMA model.""" + +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache, StaticCache +# from transformers.modeling_attn_mask_utils import AttentionMaskConverter +from dataclasses import dataclass +@dataclass +class AttentionMaskConverter: + """ + A utility attention mask class that allows one to: + - Create a causal 4d mask + - Create a causal 4d mask with slided window + - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length, + key_value_length) that can be multiplied with attention scores + + Examples: + + ```python + >>> import torch + >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + >>> converter = AttentionMaskConverter(True) + >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32) + tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], + [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], + [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], + [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, -3.4028e+38], + [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, 0.0000e+00]]]]) + ``` + + Parameters: + is_causal (`bool`): + Whether the attention mask should be a uni-directional (causal) or bi-directional mask. + + sliding_window (`int`, *optional*): + Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer. + """ + + is_causal: bool + sliding_window: int + + def __init__(self, is_causal: bool, sliding_window: Optional[int] = None): + self.is_causal = is_causal + self.sliding_window = sliding_window + + if self.sliding_window is not None and self.sliding_window <= 0: + raise ValueError( + f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`" + ) + + def to_causal_4d( + self, + batch_size: int, + query_length: int, + key_value_length: int, + dtype: torch.dtype, + device: Union[torch.device, "str"] = "cpu", + ) -> Optional[torch.Tensor]: + """ + Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative + bias to upper right hand triangular matrix (causal mask). + """ + if not self.is_causal: + raise ValueError(f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True.") + + # If shape is not cached, create a new causal mask and cache it + input_shape = (batch_size, query_length) + past_key_values_length = key_value_length - query_length + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + causal_4d_mask = None + if input_shape[-1] > 1 or self.sliding_window is not None: + causal_4d_mask = self._make_causal_mask( + input_shape, + dtype, + device=device, + past_key_values_length=past_key_values_length, + sliding_window=self.sliding_window, + ) + + return causal_4d_mask + + def to_4d( + self, + attention_mask_2d: torch.Tensor, + query_length: int, + dtype: torch.dtype, + key_value_length: Optional[int] = None, + ) -> torch.Tensor: + """ + Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length, + key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is + causal, a causal mask will be added. + """ + input_shape = (attention_mask_2d.shape[0], query_length) + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + causal_4d_mask = None + if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal: + if key_value_length is None: + raise ValueError( + "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask." + ) + + past_key_values_length = key_value_length - query_length + causal_4d_mask = self._make_causal_mask( + input_shape, + dtype, + device=attention_mask_2d.device, + past_key_values_length=past_key_values_length, + sliding_window=self.sliding_window, + ) + elif self.sliding_window is not None: + raise NotImplementedError("Sliding window is currently only implemented for causal masking") + + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1]).to( + attention_mask_2d.device + ) + + if causal_4d_mask is not None: + expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(dtype).min) + + # expanded_attn_mask + causal_4d_mask can cause some overflow + expanded_4d_mask = expanded_attn_mask + + return expanded_4d_mask + + @staticmethod + def _make_causal_mask( + input_ids_shape: torch.Size, + dtype: torch.dtype, + device: torch.device, + past_key_values_length: int = 0, + sliding_window: Optional[int] = None, + ): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + + # add lower triangular sliding window mask if necessary + if sliding_window is not None: + diagonal = past_key_values_length - sliding_window - 1 + + context_mask = torch.tril(torch.ones_like(mask, dtype=torch.bool), diagonal=diagonal) + mask.masked_fill_(context_mask, torch.finfo(dtype).min) + + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + @staticmethod + def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + @staticmethod + def _unmask_unattended( + expanded_mask: torch.FloatTensor, + min_dtype: float, + ): + # fmt: off + """ + Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when + using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + Details: https://github.com/pytorch/pytorch/issues/110213 + + `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len]. + `attention_mask` is [bsz, src_seq_len]. + + The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias. + + For example, if `expanded_mask` is (e.g. here left-padding case) + ``` + [[[[0, 0, 0], + [0, 0, 0], + [0, 0, 1]]], + [[[1, 0, 0], + [1, 1, 0], + [1, 1, 1]]], + [[[0, 0, 0], + [0, 1, 0], + [0, 1, 1]]]] + ``` + then the modified `expanded_mask` will be + ``` + [[[[1, 1, 1], <-- modified + [1, 1, 1], <-- modified + [0, 0, 1]]], + [[[1, 0, 0], + [1, 1, 0], + [1, 1, 1]]], + [[[1, 1, 1], <-- modified + [0, 1, 0], + [0, 1, 1]]]] + ``` + """ + # fmt: on + if expanded_mask.dtype == torch.bool: + raise ValueError( + "AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor." + ) + + return expanded_mask.mul(~torch.all(expanded_mask == min_dtype, dim=-1, keepdim=True)) + + @staticmethod + def _ignore_causal_mask_sdpa( + attention_mask: Optional[torch.Tensor], + inputs_embeds: torch.Tensor, + past_key_values_length: int, + sliding_window: Optional[int] = None, + ) -> bool: + """ + Detects whether the optional user-specified attention_mask & the automatically created causal mask can be ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument. + + In case no token is masked in the `attention_mask` argument, if `query_length == 1` or + `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks, + allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed). + """ + + batch_size, query_length = inputs_embeds.shape[0], inputs_embeds.shape[1] + key_value_length = query_length + past_key_values_length + + is_tracing = ( + torch.jit.is_tracing() + or isinstance(inputs_embeds, torch.fx.Proxy) + or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) + ) + + ignore_causal_mask = False + + if attention_mask is None: + # TODO: When tracing with TorchDynamo with fullgraph=True, the model is recompiled depending on the input shape, thus SDPA's `is_causal` argument is rightfully updated (see https://gist.github.com/fxmarty/1313f39037fc1c112508989628c57363). However, when using `torch.export` or + # or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is hard-coded. If a user exports a model with q_len > 1, the exported model will hard-code `is_causal=True` which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108). + # Thus, we currently can NOT set `ignore_causal_mask = True` here. We would need a `torch._dynamo.is_exporting()` flag. + # + # Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` (`TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor`). + if ( + not is_tracing + and (query_length == 1 or key_value_length == query_length) + and (sliding_window is None or key_value_length < sliding_window) + ): + ignore_causal_mask = True + elif sliding_window is None or key_value_length < sliding_window: + if len(attention_mask.shape) == 4: + expected_shape = (batch_size, 1, query_length, key_value_length) + if tuple(attention_mask.shape) != expected_shape: + raise ValueError( + f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." + ) + elif not is_tracing and torch.all(attention_mask == 1): + if query_length == 1 or key_value_length == query_length: + # For query_length == 1, causal attention and bi-directional attention are the same. + ignore_causal_mask = True + + # Unfortunately, for query_length > 1 and key_value_length != query_length, we cannot generally ignore the attention mask, as SDPA causal mask generation + # may be wrong. We will set `is_causal=False` in SDPA and rely on Transformers attention_mask instead, hence not setting it to None here. + # Reference: https://github.com/pytorch/pytorch/issues/108108 + # TODO: maybe revisit this with https://github.com/pytorch/pytorch/pull/114823 in PyTorch 2.3. + + return ignore_causal_mask + + +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + QuestionAnsweringModelOutput, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_llama import LlamaConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "LlamaConfig" + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +class LlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm) + + +class LlamaRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + super().__init__() + self.scaling_factor = scaling_factor + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + # For BC we register cos and sin cached + self.max_seq_len_cached = max_position_embeddings + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + t = t / self.scaling_factor + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("_cos_cached", emb.cos().to(torch.get_default_dtype()), persistent=False) + self.register_buffer("_sin_cached", emb.sin().to(torch.get_default_dtype()), persistent=False) + + @property + def sin_cached(self): + logger.warning_once( + "The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use " + "the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class" + ) + return self._sin_cached + + @property + def cos_cached(self): + logger.warning_once( + "The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use " + "the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class" + ) + return self._cos_cached + + @torch.no_grad() + def forward(self, x, position_ids): + # x: [bs, num_attention_heads, seq_len, head_size] + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def forward(self, x, position_ids): + # difference to the original RoPE: a scaling factor is aplied to the position ids + position_ids = position_ids.float() / self.scaling_factor + cos, sin = super().forward(x, position_ids) + return cos, sin + + +class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def forward(self, x, position_ids): + # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / ( + base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(x.device) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: this may break with compilation + + cos, sin = super().forward(x, position_ids) + return cos, sin + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class LlamaMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 + ) + up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) + ] + down_proj = sum(down_proj) + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class LlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = LlamaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = LlamaLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + past_key_value = getattr(self, "past_key_value", past_key_value) + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class LlamaFlashAttention2(LlamaAttention): + """ + Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + past_key_value = getattr(self, "past_key_value", past_key_value) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +class LlamaSdpaAttention(LlamaAttention): + """ + Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from LlamaAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + # In case static cache is used, it is an instance attribute. + past_key_value = getattr(self, "past_key_value", past_key_value) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # In case we are not compiling, we may set `causal_mask` to None, which is required to dispatch to SDPA's Flash Attention 2 backend, rather + # relying on the `is_causal` argument. + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=causal_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +LLAMA_ATTENTION_CLASSES = { + "eager": LlamaAttention, + "flash_attention_2": LlamaFlashAttention2, + "sdpa": LlamaSdpaAttention, +} + + +class LlamaDecoderLayer(nn.Module): + def __init__(self, config: LlamaConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + + self.mlp = LlamaMLP(config) + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +LLAMA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`LlamaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaPreTrainedModel(PreTrainedModel): + config_class = LlamaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LlamaDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _setup_cache(self, cache_cls, max_batch_size, max_cache_len: Optional[int] = None): + if self.config._attn_implementation == "flash_attention_2" and cache_cls == StaticCache: + raise ValueError( + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers" + ) + + for layer in self.model.layers: + device = layer.input_layernorm.weight.device + if hasattr(self.config, "_pre_quantization_dtype"): + dtype = self.config._pre_quantization_dtype + else: + dtype = layer.self_attn.o_proj.weight.dtype + layer.self_attn.past_key_value = cache_cls( + self.config, max_batch_size, max_cache_len, device=device, dtype=dtype + ) + + def _reset_cache(self): + for layer in self.model.layers: + layer.self_attn.past_key_value = None + + +LLAMA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaModel(LlamaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] + + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + past_seen_tokens = 0 + if use_cache: # kept for BC (cache positions) + if not isinstance(past_key_values, StaticCache): + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_seen_tokens = past_key_values.get_seq_length() + + if cache_position is None: + if isinstance(past_key_values, StaticCache): + raise ValueError("cache_position is a required argument when using StaticCache.") + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_seen_tokens) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = ( + next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache + ) + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_seen_tokens: int, + ): + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + if self.config._attn_implementation == "sdpa": + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, + # in order to dispatch on Flash Attention 2. + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + if hasattr(getattr(self.layers[0], "self_attn", {}), "past_key_value"): # static cache + target_length = self.config.max_position_embeddings + else: # dynamic cache + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) + elif attention_mask.dim() == 4: + # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with + # cache. In that case, the 4D attention mask attends to the newest tokens only. + if attention_mask.shape[-2] < cache_position[0] + sequence_length: + offset = cache_position[0] + else: + offset = 0 + mask_shape = attention_mask.shape + mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +class LlamaForCausalLM(LlamaPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = LlamaModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, LlamaForCausalLM + + >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) + logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] + logits = torch.cat(logits, dim=-1) + else: + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs + ): + # With static cache, the `past_key_values` is None + # TODO joao: standardize interface for the different Cache classes and remove of this if + has_static_cache = False + if past_key_values is None: + past_key_values = getattr(getattr(self.model.layers[0], "self_attn", {}), "past_key_value", None) + has_static_cache = past_key_values is not None + + past_length = 0 + if past_key_values is not None: + if isinstance(past_key_values, Cache): + past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() + max_cache_length = ( + torch.tensor(past_key_values.get_max_length(), device=input_ids.device) + if past_key_values.get_max_length() is not None + else None + ) + cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length) + # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + else: + remove_prefix_length = input_ids.shape[1] - 1 + input_ids = input_ids[:, remove_prefix_length:] + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. + model_inputs = {"input_ids": input_ids.contiguous()} + + input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1] + if cache_position is None: + cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device) + else: + cache_position = cache_position[-input_length:] + + if has_static_cache: + past_key_values = None + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The LLaMa Model transformer with a sequence classification head on top (linear layer). + + [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + LLAMA_START_DOCSTRING, +) +class LlamaForSequenceClassification(LlamaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = LlamaModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ +The Llama Model transformer with a span classification head on top for extractive question-answering tasks like +SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + LLAMA_START_DOCSTRING, +) +class LlamaForQuestionAnswering(LlamaPreTrainedModel): + base_model_prefix = "transformer" + + # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Llama + def __init__(self, config): + super().__init__(config) + self.transformer = LlamaModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.transformer.embed_tokens + + def set_input_embeddings(self, value): + self.transformer.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1).to(start_logits.device) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1).to(end_logits.device) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/Unicorn_dense/bunny/model/language_model/llama/tokenization_llama.py b/Unicorn_dense/bunny/model/language_model/llama/tokenization_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..a2eb7e157c5f6f076b0d30ca6c00ff24c71fb6b4 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/llama/tokenization_llama.py @@ -0,0 +1,471 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for LLaMA.""" +import os +from shutil import copyfile +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from transformers.convert_slow_tokenizer import import_protobuf +from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer +from transformers.utils import logging + + +if TYPE_CHECKING: + from transformers.tokenization_utils_base import TextInput + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} + +SPIECE_UNDERLINE = "▁" + +B_INST, E_INST = "[INST]", "[/INST]" +B_SYS, E_SYS = "<>\n", "\n<>\n\n" + +# fmt: off +DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \ +answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\ + that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \ +correct. If you don't know the answer to a question, please don't share false information.""" +# fmt: on + + +class LlamaTokenizer(PreTrainedTokenizer): + """ + Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is + no padding token in the original model. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The end of sequence token. + pad_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by + attention mechanisms or loss computation. + sp_model_kwargs (`Dict[str, Any]`, `Optional`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + add_bos_token (`bool`, *optional*, defaults to `True`): + Whether or not to add an `bos_token` at the start of sequences. + add_eos_token (`bool`, *optional*, defaults to `False`): + Whether or not to add an `eos_token` at the end of sequences. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. + use_default_system_prompt (`bool`, *optional*, defaults to `False`): + Whether or not the default system prompt for Llama should be used. + spaces_between_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to add spaces between special tokens. + legacy (`bool`, *optional*): + Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622 + and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple + example: + + - `legacy=True`: + ```python + >>> from transformers import T5Tokenizer + + >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True) + >>> tokenizer.encode("Hello .") + [8774, 32099, 3, 5, 1] + ``` + - `legacy=False`: + ```python + >>> from transformers import T5Tokenizer + + >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False) + >>> tokenizer.encode("Hello .") # the extra space `[3]` is no longer here + [8774, 32099, 5, 1] + ``` + Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details. + add_prefix_space (`bool`, *optional*, defaults to `True`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. + + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token=None, + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + clean_up_tokenization_spaces=False, + use_default_system_prompt=False, + spaces_between_special_tokens=False, + legacy=None, + add_prefix_space=True, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token + + if legacy is None: + logger.warning_once( + f"You are using the default legacy behaviour of the {self.__class__}. This is" + " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you." + " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it" + " means, and thoroughly read the reason why this was added as explained in" + " https://github.com/huggingface/transformers/pull/24565" + ) + legacy = True + + self.legacy = legacy + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.use_default_system_prompt = use_default_system_prompt + self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False)) + self.add_prefix_space = add_prefix_space + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + sp_model_kwargs=self.sp_model_kwargs, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + use_default_system_prompt=use_default_system_prompt, + spaces_between_special_tokens=spaces_between_special_tokens, + legacy=legacy, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + @property + def unk_token_length(self): + return len(self.sp_model.encode(str(self.unk_token))) + + # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor + def get_spm_processor(self, from_slow=False): + tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs) + if self.legacy or from_slow: # no dependency on protobuf + tokenizer.Load(self.vocab_file) + return tokenizer + + with open(self.vocab_file, "rb") as f: + sp_model = f.read() + model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)") + model = model_pb2.ModelProto.FromString(sp_model) + normalizer_spec = model_pb2.NormalizerSpec() + normalizer_spec.add_dummy_prefix = False + model.normalizer_spec.MergeFrom(normalizer_spec) + sp_model = model.SerializeToString() + tokenizer.LoadFromSerializedProto(sp_model) + return tokenizer + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize + def tokenize(self, text: "TextInput", **kwargs) -> List[str]: + """ + Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the + first token is special. + """ + if self.legacy or len(text) == 0: + return super().tokenize(text, **kwargs) + + text = text.replace(SPIECE_UNDERLINE, " ") + if self.add_prefix_space: + text = SPIECE_UNDERLINE + text + + tokens = super().tokenize(text, **kwargs) + + if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: + tokens = tokens[1:] + return tokens + + # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize + def _tokenize(self, text, **kwargs): + """ + Returns a tokenized string. + + We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any + SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give + `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the + `unk_token`. Here is an example with `unk_token = ""` and `unk_token_length = 4`. + `self.tokenizer.sp_model.encode(" Hey", out_type = str)[4:]`. + """ + tokens = self.sp_model.encode(text, out_type=str) + if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")): + return tokens + + # 1. Encode string + prefix ex: " Hey" + tokens = self.sp_model.encode(self.unk_token + text, out_type=str) + # 2. Remove self.unk_token from ['<','unk','>', '▁Hey'] + return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + # since we manually add the prefix space, we have to remove it when decoding + if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space: + tokens[0] = tokens[0][1:] + + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for i, token in enumerate(tokens): + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special and i != 0 and self.legacy: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + if prev_is_special and i == 1 and self.add_prefix_space and not token.startswith(SPIECE_UNDERLINE): + out_string += " " + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string + + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = bos_token_id + token_ids_0 + eos_token_id + + if token_ids_1 is not None: + output = output + bos_token_id + token_ids_1 + eos_token_id + + return output + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + bos_token_id = [1] if self.add_bos_token else [] + eos_token_id = [1] if self.add_eos_token else [] + + if token_ids_1 is None: + return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + return ( + bos_token_id + + ([0] * len(token_ids_0)) + + eos_token_id + + bos_token_id + + ([0] * len(token_ids_1)) + + eos_token_id + ) + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + if token_ids_1 is None, only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of ids. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) + + if token_ids_1 is not None: + output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) + + return output + + @property + def default_chat_template(self): + """ + LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. + Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict + user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering + rather than needing special tokens. The system message is partly 'embedded' in the first user message, which + results in an unusual token ordering when it is present. This template should definitely be changed if you wish + to fine-tune a model with more flexible role ordering! + + The output should look something like: + + [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer + [INST] Prompt [/INST] + + The reference for this chat template is [this code + snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) + in the original repository. + """ + logger.warning_once( + "\nNo chat template is defined for this tokenizer - using the default template " + f"for the {self.__class__.__name__} class. If the default is not appropriate for " + "your model, please set `tokenizer.chat_template` to an appropriate template. " + "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n" + ) + template = ( + "{% if messages[0]['role'] == 'system' %}" + "{% set loop_messages = messages[1:] %}" # Extract system message if it's present + "{% set system_message = messages[0]['content'] %}" + "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" + "{% set loop_messages = messages %}" # Or use the default system message if the flag is set + "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" + "{% else %}" + "{% set loop_messages = messages %}" + "{% set system_message = false %}" + "{% endif %}" + "{% for message in loop_messages %}" # Loop over all non-system messages + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" + "{% endif %}" + "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message + "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" + "{% else %}" + "{% set content = message['content'] %}" + "{% endif %}" + "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way + "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" + "{% elif message['role'] == 'system' %}" + "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" + "{% elif message['role'] == 'assistant' %}" + "{{ ' ' + content.strip() + ' ' + eos_token }}" + "{% endif %}" + "{% endfor %}" + ) + template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") + default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") + template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) + + return template diff --git a/Unicorn_dense/bunny/model/language_model/llama/tokenization_llama_fast.py b/Unicorn_dense/bunny/model/language_model/llama/tokenization_llama_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..cd3d5acf9db5b20969983341cd7a7e0c0c72ad68 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/llama/tokenization_llama_fast.py @@ -0,0 +1,281 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from shutil import copyfile +from typing import Optional, Tuple + +from tokenizers import processors + +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast +from transformers.utils import is_sentencepiece_available, logging +from transformers.utils.versions import require_version + + +require_version("tokenizers>=0.13.3") + +if is_sentencepiece_available(): + from .tokenization_llama import LlamaTokenizer +else: + LlamaTokenizer = None + +logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"} + +B_INST, E_INST = "[INST]", "[/INST]" +B_SYS, E_SYS = "<>\n", "\n<>\n\n" + +# fmt: off +DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \ +answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\ + that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \ +correct. If you don't know the answer to a question, please don't share false information.""" +# fmt: on + + +class LlamaTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. + + This uses notably ByteFallback and no normalization. + + ```python + >>> from transformers import LlamaTokenizerFast + + >>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer") + >>> tokenizer.encode("Hello this is a test") + [1, 15043, 445, 338, 263, 1243] + ``` + + If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or + call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the + values of the first token and final token of an encoded sequence will not be correct). For more details, checkout + [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation. + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`, *optional*): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that + contains the vocabulary necessary to instantiate a tokenizer. + tokenizer_file (`str`, *optional*): + [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that + contains everything needed to load the tokenizer. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. + unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The end of sequence token. + add_bos_token (`bool`, *optional*, defaults to `True`): + Whether or not to add an `bos_token` at the start of sequences. + add_eos_token (`bool`, *optional*, defaults to `False`): + Whether or not to add an `eos_token` at the end of sequences. + use_default_system_prompt (`bool`, *optional*, defaults to `False`): + Whether or not the default system prompt for Llama should be used. + add_prefix_space (`bool`, *optional*): + Whether or not the tokenizer should automatically add a prefix space + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = LlamaTokenizer + padding_side = "left" + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + clean_up_tokenization_spaces=False, + unk_token="", + bos_token="", + eos_token="", + add_bos_token=True, + add_eos_token=False, + use_default_system_prompt=False, + add_prefix_space=None, + **kwargs, + ): + if add_prefix_space is not None: + logger.warning_once( + "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers" + ) + kwargs["from_slow"] = True + + super().__init__( + vocab_file=vocab_file, + tokenizer_file=tokenizer_file, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + use_default_system_prompt=use_default_system_prompt, + **kwargs, + ) + self._add_bos_token = add_bos_token + self._add_eos_token = add_eos_token + self.update_post_processor() + self.use_default_system_prompt = use_default_system_prompt + self.vocab_file = vocab_file + + @property + def can_save_slow_tokenizer(self) -> bool: + return os.path.isfile(self.vocab_file) if self.vocab_file else False + + def update_post_processor(self): + """ + Updates the underlying post processor with the current `bos_token` and `eos_token`. + """ + bos = self.bos_token + bos_token_id = self.bos_token_id + if bos is None and self.add_bos_token: + raise ValueError("add_bos_token = True but bos_token = None") + + eos = self.eos_token + eos_token_id = self.eos_token_id + if eos is None and self.add_eos_token: + raise ValueError("add_eos_token = True but eos_token = None") + + single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}" + pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}" + + special_tokens = [] + if self.add_bos_token: + special_tokens.append((bos, bos_token_id)) + if self.add_eos_token: + special_tokens.append((eos, eos_token_id)) + self._tokenizer.post_processor = processors.TemplateProcessing( + single=single, pair=pair, special_tokens=special_tokens + ) + + @property + def add_eos_token(self): + return self._add_eos_token + + @property + def add_bos_token(self): + return self._add_bos_token + + @add_eos_token.setter + def add_eos_token(self, value): + self._add_eos_token = value + self.update_post_processor() + + @add_bos_token.setter + def add_bos_token(self, value): + self._add_bos_token = value + self.update_post_processor() + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + @property + # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template + def default_chat_template(self): + """ + LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. + Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict + user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering + rather than needing special tokens. The system message is partly 'embedded' in the first user message, which + results in an unusual token ordering when it is present. This template should definitely be changed if you wish + to fine-tune a model with more flexible role ordering! + + The output should look something like: + + [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer + [INST] Prompt [/INST] + + The reference for this chat template is [this code + snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) + in the original repository. + """ + logger.warning_once( + "\nNo chat template is defined for this tokenizer - using the default template " + f"for the {self.__class__.__name__} class. If the default is not appropriate for " + "your model, please set `tokenizer.chat_template` to an appropriate template. " + "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n" + ) + template = ( + "{% if messages[0]['role'] == 'system' %}" + "{% set loop_messages = messages[1:] %}" # Extract system message if it's present + "{% set system_message = messages[0]['content'] %}" + "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" + "{% set loop_messages = messages %}" # Or use the default system message if the flag is set + "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" + "{% else %}" + "{% set loop_messages = messages %}" + "{% set system_message = false %}" + "{% endif %}" + "{% for message in loop_messages %}" # Loop over all non-system messages + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" + "{% endif %}" + "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message + "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" + "{% else %}" + "{% set content = message['content'] %}" + "{% endif %}" + "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way + "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" + "{% elif message['role'] == 'system' %}" + "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" + "{% elif message['role'] == 'assistant' %}" + "{{ ' ' + content.strip() + ' ' + eos_token }}" + "{% endif %}" + "{% endfor %}" + ) + template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") + default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") + template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) + + return template + + # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers + # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = bos_token_id + token_ids_0 + eos_token_id + + if token_ids_1 is not None: + output = output + bos_token_id + token_ids_1 + eos_token_id + + return output diff --git a/Unicorn_dense/bunny/model/language_model/minicpm/__pycache__/configuration_minicpm.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/minicpm/__pycache__/configuration_minicpm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a61fbe84e6187921916a8c8a99505dd5fbcad99 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/minicpm/__pycache__/configuration_minicpm.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/minicpm/__pycache__/modeling_minicpm.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/minicpm/__pycache__/modeling_minicpm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..caec93d61ca390f4737d5c997cfd45f04b62e946 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/minicpm/__pycache__/modeling_minicpm.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/minicpm/configuration_minicpm.py b/Unicorn_dense/bunny/model/language_model/minicpm/configuration_minicpm.py new file mode 100644 index 0000000000000000000000000000000000000000..21eb158572ad0cfbfdf51072106d5018c2ff8ca5 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/minicpm/configuration_minicpm.py @@ -0,0 +1,202 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" MiniCPM model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +MINICPM_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class MiniCPMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MiniCPMModel`]. It is used to instantiate an MiniCPM + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the MiniCPM-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the MiniCPM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MiniCPMModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. MiniCPM 1 supports up to 2048 tokens, + MiniCPM 2 up to 4096, CodeMiniCPM up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalMiniCPM/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + ```python + >>> from transformers import MiniCPMModel, MiniCPMConfig + + >>> # Initializing a MiniCPM minicpm-7b style configuration + >>> configuration = MiniCPMConfig() + + >>> # Initializing a model from the minicpm-7b style configuration + >>> model = MiniCPMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "minicpm" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=True, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + scale_emb=1, + dim_model_base=1, + scale_depth=1, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.scale_emb = scale_emb + self.dim_model_base = dim_model_base + self.scale_depth = scale_depth + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + try: + import flash_attn + self._attn_implementation = "flash_attention_2" + except: + pass + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") diff --git a/Unicorn_dense/bunny/model/language_model/minicpm/modeling_minicpm.py b/Unicorn_dense/bunny/model/language_model/minicpm/modeling_minicpm.py new file mode 100644 index 0000000000000000000000000000000000000000..acb19fbd03f3faa82e450ecc5384130ff38a9b27 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/minicpm/modeling_minicpm.py @@ -0,0 +1,1456 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch MiniCPM model.""" +import math +import warnings +from typing import List, Optional, Tuple, Union, Dict + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_causal_attention_mask, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13 +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from transformers.utils.import_utils import is_torch_fx_available +from .configuration_minicpm import MiniCPMConfig +import re + +try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa +except: + pass + + +# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. +# It means that the function will not be traced through and simply appear as a node in the graph. +if is_torch_fx_available(): + if not is_torch_greater_or_equal_than_1_13: + import torch.fx + + _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MiniCPMConfig" + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + warnings.warn( + "Calling `transformers.models.minicpm.modeling_minicpm._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask" + ) + return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) + + +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + warnings.warn( + "Calling `transformers.models.minicpm.modeling_minicpm._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.minicpm.modeling_minicpm.AttentionMaskConverter._make_causal_mask" + ) + return AttentionMaskConverter._make_causal_mask( + input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length + ) + +# @torch.jit.script # type: ignore +def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float): + old_dtype = hidden.dtype + variance = hidden.to(torch.float32).pow(2).mean(dim=-1, keepdim=True) + hidden = (hidden * torch.rsqrt(variance + eps)).to(old_dtype) + return hidden * weight + + +class MiniCPMRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MiniCPMRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + return rms_layernorm(hidden_states, self.weight, self.variance_epsilon) + + +ALL_LAYERNORM_LAYERS.append(MiniCPMRMSNorm) + + +class MiniCPMRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + # seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.float32 + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding): + """MiniCPMRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding): + """MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + # cos = cos[position_ids].unsqueeze(unsqueeze_dim) + # sin = sin[position_ids].unsqueeze(unsqueeze_dim) + # q_embed = (q * cos) + (rotate_half(q) * sin) + # k_embed = (k * cos) + (rotate_half(k) * sin) + orig_dtype = k.dtype + cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + q_fp32 = q.to(dtype=torch.float32, device=q.device) + k_fp32 = k.to(dtype=torch.float32, device=k.device) + q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin) + k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin) + return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype) + +class MiniCPMMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 + ) + up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) + ] + down_proj = sum(down_proj) + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + + +class MiniCPMAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = MiniCPMRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = MiniCPMLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = MiniCPMDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states.to(torch.float32), seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class MiniCPMFlashAttention2(MiniCPMAttention): + """ + MiniCPM flash attention module. This module inherits from `MiniCPMAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # MiniCPMFlashAttention2 attention does not support output_attentions + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states.to(torch.float32), seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (MiniCPMRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + # Handle the case where the model is quantized + if hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in MiniCPMFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +class MiniCPMSdpaAttention(MiniCPMAttention): + """ + MiniCPM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MiniCPMAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MiniCPMAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MiniCPMModel is using MiniCPMSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +MINICPM_ATTENTION_CLASSES = { + "eager": MiniCPMAttention, + "flash_attention_2": MiniCPMFlashAttention2, + "sdpa": MiniCPMSdpaAttention, +} + + +class MiniCPMDecoderLayer(nn.Module): + def __init__(self, config: MiniCPMConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = MINICPM_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + + self.mlp = MiniCPMMLP(config) + self.input_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.scale_depth = config.scale_depth + self.num_hidden_layers = config.num_hidden_layers + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + + hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers)) + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers)) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +MINICPM_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MiniCPMConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.", + MINICPM_START_DOCSTRING, +) +class MiniCPMPreTrainedModel(PreTrainedModel): + config_class = MiniCPMConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MiniCPMDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MINICPM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.", + MINICPM_START_DOCSTRING, +) +class MiniCPMModel(MiniCPMPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`] + + Args: + config: MiniCPMConfig + """ + + def __init__(self, config: MiniCPMConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [MiniCPMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._use_sdpa = config._attn_implementation == "sdpa" + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb + + + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._use_sdpa and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class MiniCPMForCausalLM(MiniCPMPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = MiniCPMModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MiniCPMForCausalLM + + >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) + logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] + logits = torch.cat(logits, dim=-1) + else: + logits = self.lm_head(hidden_states / (self.config.hidden_size / self.config.dim_model_base)) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + else: + remove_prefix_length = input_ids.shape[1] - 1 + input_ids = input_ids[:, remove_prefix_length:] + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @torch.inference_mode() + def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + max_length: int = 4096, num_beams=1, do_sample=True, top_p=0.8, temperature=0.3, logits_processor=None, + **kwargs): + if history is None: + history = [] + if logits_processor: + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + else: + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + + history.append({"role": role, "content": query}) + history_str = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=False) + inputs = tokenizer(history_str, return_tensors='pt').to(self.device) + outputs = self.generate(**inputs, **gen_kwargs) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + pattern = re.compile(r".*?(?=|<用户>)", re.DOTALL) + matches = pattern.findall(response) + if len(matches) > 0: + response = matches[0] + history.append({"role": "assistant", "content": response}) + return response, history + + +@add_start_docstrings( + """ + The MiniCPM Model transformer with a sequence classification head on top (linear layer). + + [`MiniCPMForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + MINICPM_START_DOCSTRING, +) +class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MiniCPMModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to( + logits.device + ) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/Unicorn_dense/bunny/model/language_model/phi/__init__.py b/Unicorn_dense/bunny/model/language_model/phi/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..68f18405e1c7041317605e937d420c94891602f5 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/phi/__init__.py @@ -0,0 +1,69 @@ +# Copyright 2023 Microsoft and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import TYPE_CHECKING + +from transformers.utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_sentencepiece_available, + is_tokenizers_available, + is_torch_available, +) + + +_import_structure = { + "configuration_phi": ["PHI_PRETRAINED_CONFIG_ARCHIVE_MAP", "PhiConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_phi"] = [ + "PHI_PRETRAINED_MODEL_ARCHIVE_LIST", + "PhiPreTrainedModel", + "PhiModel", + "PhiForCausalLM", + "PhiForSequenceClassification", + "PhiForTokenClassification", + ] + + +if TYPE_CHECKING: + from .configuration_phi import PHI_PRETRAINED_CONFIG_ARCHIVE_MAP, PhiConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_phi import ( + PHI_PRETRAINED_MODEL_ARCHIVE_LIST, + PhiForCausalLM, + PhiForSequenceClassification, + PhiForTokenClassification, + PhiModel, + PhiPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/Unicorn_dense/bunny/model/language_model/phi/__pycache__/__init__.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/phi/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aed5baf9e532dc2d6080023e91b1bcce9761673e Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/phi/__pycache__/__init__.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/phi/__pycache__/configuration_phi.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/phi/__pycache__/configuration_phi.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cfc38d86489148c7a27d2ae8864a657bc7aef3a2 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/phi/__pycache__/configuration_phi.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/phi/__pycache__/modeling_phi.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/phi/__pycache__/modeling_phi.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87c7b26bebfecbcbf9a4eba672fd92d8932a1ac6 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/phi/__pycache__/modeling_phi.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/phi/configuration_phi.py b/Unicorn_dense/bunny/model/language_model/phi/configuration_phi.py new file mode 100644 index 0000000000000000000000000000000000000000..eda2bc23d8ff8f4345af575f5cde98bae4813191 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/phi/configuration_phi.py @@ -0,0 +1,195 @@ +# coding=utf-8 +# Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Phi model configuration""" + + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +PHI_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "microsoft/phi-1": "https://huggingface.co/microsoft/phi-1/resolve/main/config.json", + "microsoft/phi-1_5": "https://huggingface.co/microsoft/phi-1_5/resolve/main/config.json", + "microsoft/phi-2": "https://huggingface.co/microsoft/phi-2/resolve/main/config.json", +} + + +class PhiConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PhiModel`]. It is used to instantiate an Phi + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Phi + [microsoft/phi-1](https://huggingface.co/microsoft/phi-1). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 51200): + Vocabulary size of the Phi model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`PhiModel`]. + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 8192): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + resid_pdrop (`float`, *optional*, defaults to 0.0): + Dropout probability for mlp outputs. + embd_pdrop (`int`, *optional*, defaults to 0.0): + The dropout ratio for the embeddings. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio after computing the attention scores. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Phi-1 and Phi-1.5 supports up to 2048 + tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format + is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalPersimmon/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This + is an experimental feature, subject to breaking API changes in future versions. + partial_rotary_factor (`float`, *optional*, defaults to 0.5): + Percentage of the query and keys which will have rotary embedding. + qk_layernorm (`bool`, *optional*, defaults to `False`): + Whether or not to normalize the Queries and Keys after projecting the hidden states. + bos_token_id (`int`, *optional*, defaults to 1): + Denotes beginning of sequences token id. + eos_token_id (`int`, *optional*, defaults to 2): + Denotes end of sequences token id. + + Example: + + ```python + >>> from transformers import PhiModel, PhiConfig + + >>> # Initializing a Phi-1 style configuration + >>> configuration = PhiConfig.from_pretrained("microsoft/phi-1") + + >>> # Initializing a model from the configuration + >>> model = PhiModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "phi" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=51200, + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=24, + num_attention_heads=32, + num_key_value_heads=None, + resid_pdrop=0.0, + embd_pdrop=0.0, + attention_dropout=0.0, + hidden_act="gelu_new", + max_position_embeddings=2048, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + partial_rotary_factor=0.5, + qk_layernorm=False, + bos_token_id=1, + eos_token_id=2, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.partial_rotary_factor = partial_rotary_factor + self.qk_layernorm = qk_layernorm + self._rope_scaling_validation() + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") \ No newline at end of file diff --git a/Unicorn_dense/bunny/model/language_model/phi/modeling_phi.py b/Unicorn_dense/bunny/model/language_model/phi/modeling_phi.py new file mode 100644 index 0000000000000000000000000000000000000000..8f14944b03c52bfc6e3442c901a8e3c6ad694c7c --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/phi/modeling_phi.py @@ -0,0 +1,1374 @@ +# coding=utf-8 +# Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" PyTorch Phi model.""" + + +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_phi import PhiConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "microsoft/phi-1" +_CONFIG_FOR_DOC = "PhiConfig" + +PHI_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/phi-1", + "microsoft/phi-1_5", + "microsoft/phi-2", + # See all Phi models at https://huggingface.co/models?filter=phi +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Phi +class PhiRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Phi +class PhiLinearScalingRotaryEmbedding(PhiRotaryEmbedding): + """PhiRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + t = t / self.scaling_factor + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Phi +class PhiDynamicNTKScalingRotaryEmbedding(PhiRotaryEmbedding): + """PhiRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Phi +class PhiMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class PhiAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: PhiConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.partial_rotary_factor = config.partial_rotary_factor + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=True) + + self.qk_layernorm = config.qk_layernorm + if self.qk_layernorm: + self.q_layernorm = nn.LayerNorm( + config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True + ) + self.k_layernorm = nn.LayerNorm( + config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True + ) + + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = PhiRotaryEmbedding( + int(self.partial_rotary_factor * self.head_dim), + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = PhiLinearScalingRotaryEmbedding( + int(self.partial_rotary_factor * self.head_dim), + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = PhiDynamicNTKScalingRotaryEmbedding( + int(self.partial_rotary_factor * self.head_dim), + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.qk_layernorm: + query_states = self.q_layernorm(query_states) + key_states = self.k_layernorm(key_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + # Partial rotary embedding + query_rot, query_pass = ( + query_states[..., : self.rotary_emb.dim], + query_states[..., self.rotary_emb.dim :], + ) + key_rot, key_pass = ( + key_states[..., : self.rotary_emb.dim], + key_states[..., self.rotary_emb.dim :], + ) + # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] + query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) + + # [batch_size, seq_length, num_heads, head_dim] + query_states = torch.cat((query_rot, query_pass), dim=-1) + key_states = torch.cat((key_rot, key_pass), dim=-1) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # Queries and keys upcast to fp32 is required by Phi-2 to avoid overflow + attn_weights = torch.matmul( + query_states.to(torch.float32), key_states.to(torch.float32).transpose(2, 3) + ) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.dense(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class PhiFlashAttention2(PhiAttention): + """ + Phi flash attention module. This module inherits from `PhiAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # PhiFlashAttention2 attention does not support output_attentions + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.qk_layernorm: + query_states = self.q_layernorm(query_states) + key_states = self.k_layernorm(key_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + # Partial rotary embedding + query_rot, query_pass = ( + query_states[..., : self.rotary_emb.dim], + query_states[..., self.rotary_emb.dim :], + ) + key_rot, key_pass = ( + key_states[..., : self.rotary_emb.dim], + key_states[..., self.rotary_emb.dim :], + ) + # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] + query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) + + # [batch_size, seq_length, num_heads, head_dim] + query_states = torch.cat((query_rot, query_pass), dim=-1) + key_states = torch.cat((key_rot, key_pass), dim=-1) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_dropout = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. + + if query_states.dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=attn_dropout, softmax_scale=None + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.dense(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +PHI_ATTENTION_CLASSES = { + "eager": PhiAttention, + "flash_attention_2": PhiFlashAttention2, +} + + +class PhiDecoderLayer(nn.Module): + def __init__(self, config: PhiConfig, layer_idx: int): + super().__init__() + self.self_attn = PHI_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) + self.mlp = PhiMLP(config) + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + attn_outputs = self.resid_dropout(attn_outputs) + + feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states)) + hidden_states = attn_outputs + feed_forward_hidden_states + residual + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +PHI_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`PhiConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Phi Model outputting raw hidden-states without any specific head on top.", + PHI_START_DOCSTRING, +) +class PhiPreTrainedModel(PreTrainedModel): + config_class = PhiConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["PhiDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +PHI_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Phi Model outputting raw hidden-states without any specific head on top.", + PHI_START_DOCSTRING, +) +class PhiModel(PhiPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PhiDecoderLayer`] + + Args: + config: PhiConfig + """ + + def __init__(self, config: PhiConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.embed_dropout = nn.Dropout(config.embd_pdrop) + self.layers = nn.ModuleList( + [PhiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = 0 + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + inputs_embeds = self.embed_dropout(inputs_embeds) + + # Attention mask. + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class PhiForCausalLM(PhiPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi,bias=False->bias=True + def __init__(self, config): + super().__init__(config) + self.model = PhiModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=True) + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings + def get_input_embeddings(self): + return self.model.embed_tokens + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings + def get_output_embeddings(self): + return self.lm_head + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder + def set_decoder(self, decoder): + self.model = decoder + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, PhiForCausalLM + + >>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1") + >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1") + + >>> prompt = "This is an example script ." + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + 'This is an example script .\n\n\n\nfrom typing import List\n\ndef find_most_common_letter(words: List[str' + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + else: + remove_prefix_length = input_ids.shape[1] - 1 + input_ids = input_ids[:, remove_prefix_length:] + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + if past_key_value := getattr(self.model.layers[0].self_attn, "past_key_value", None): + # generation with static cache + seen_tokens = past_key_value.get_seq_length() + input_ids = input_ids[:, seen_tokens:] + position_ids = position_ids[:, seen_tokens:] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The PhiModel with a sequence classification head on top (linear layer). + + [`PhiForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + PHI_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with LLAMA->PHI,Llama->Phi with self.transformer->self.model, transformer_outputs->model_outputs +class PhiForSequenceClassification(PhiPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = PhiModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + model_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = model_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + model_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=model_outputs.past_key_values, + hidden_states=model_outputs.hidden_states, + attentions=model_outputs.attentions, + ) + + +@add_start_docstrings( + """ + PhiModel with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + PHI_START_DOCSTRING, +) +# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with MPT->PHI,Mpt->Phi,self.transformer->self.model,transformer_outputs->model_outputs +class PhiForTokenClassification(PhiPreTrainedModel): + def __init__(self, config: PhiConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.model = PhiModel(config) + if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + model_outputs = self.model( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = model_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + batch_size, seq_length = labels.shape + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length) + ) + + if not return_dict: + output = (logits,) + model_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=model_outputs.hidden_states, + attentions=model_outputs.attentions, + ) \ No newline at end of file diff --git a/Unicorn_dense/bunny/model/language_model/phi3/__init__.py b/Unicorn_dense/bunny/model/language_model/phi3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b738e2b977d45e662f61aca1b569c16f0597e443 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/phi3/__init__.py @@ -0,0 +1,69 @@ +# Copyright 2024 Microsoft and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import TYPE_CHECKING + +from transformers.utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_sentencepiece_available, + is_tokenizers_available, + is_torch_available, +) + + +_import_structure = { + "configuration_phi3": ["PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP", "Phi3Config"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_phi3"] = [ + "PHI3_PRETRAINED_MODEL_ARCHIVE_LIST", + "Phi3PreTrainedModel", + "Phi3Model", + "Phi3ForCausalLM", + "Phi3ForSequenceClassification", + "Phi3ForTokenClassification", + ] + + +if TYPE_CHECKING: + from .configuration_phi3 import PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP, Phi3Config + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_phi3 import ( + PHI3_PRETRAINED_MODEL_ARCHIVE_LIST, + Phi3ForCausalLM, + Phi3ForSequenceClassification, + Phi3ForTokenClassification, + Phi3Model, + Phi3PreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/Unicorn_dense/bunny/model/language_model/phi3/__pycache__/__init__.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/phi3/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..01ba5d639ecf7b93077f2cb45be2b15b06f36795 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/phi3/__pycache__/__init__.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/phi3/__pycache__/configuration_phi3.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/phi3/__pycache__/configuration_phi3.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..789f15687beebe56537411b4dba50648e23ad7f6 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/phi3/__pycache__/configuration_phi3.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/phi3/__pycache__/modeling_phi3.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/phi3/__pycache__/modeling_phi3.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7fff402d1e405120aecf8c93fe82ad4e07c885d Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/phi3/__pycache__/modeling_phi3.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/phi3/configuration_phi3.py b/Unicorn_dense/bunny/model/language_model/phi3/configuration_phi3.py new file mode 100644 index 0000000000000000000000000000000000000000..f4553db23ac65c608fd150a14acbd04d3ff80a0f --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/phi3/configuration_phi3.py @@ -0,0 +1,213 @@ +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Phi-3 model configuration""" + + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "microsoft/Phi-3-mini-4k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json", + "microsoft/Phi-3-mini-128k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json", +} + + +class Phi3Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the + [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32064): + Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Phi3Model`]. + hidden_size (`int`, *optional*, defaults to 3072): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 8192): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + resid_pdrop (`float`, *optional*, defaults to 0.0): + Dropout probability for mlp outputs. + embd_pdrop (`int`, *optional*, defaults to 0.0): + The dropout ratio for the embeddings. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio after computing the attention scores. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. + original_max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model was trained with. This is used to determine the size of the + original RoPE embeddings when using long scaling. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon value used for the RMSNorm. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`dict`, *optional*): + The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must + contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and + the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size + divided by the number of attention heads divided by 2. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 32000): + The id of the "end-of-sequence" token. + pad_token_id (`int`, *optional*, defaults to 32000): + The id of the padding token. + sliding_window (`int`, *optional*): + Sliding window attention window size. If `None`, no sliding window is applied. + + Example: + + ```python + >>> from transformers import Phi3Model, Phi3Config + + >>> # Initializing a Phi-3 style configuration + >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + + >>> # Initializing a model from the configuration + >>> model = Phi3Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "phi3" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32064, + hidden_size=3072, + intermediate_size=8192, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + resid_pdrop=0.0, + embd_pdrop=0.0, + attention_dropout=0.0, + hidden_act="silu", + max_position_embeddings=4096, + original_max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + bos_token_id=1, + eos_token_id=32000, + pad_token_id=32000, + sliding_window=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.original_max_position_embeddings = original_max_position_embeddings + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.sliding_window = sliding_window + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3: + raise ValueError( + "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_short_factor = self.rope_scaling.get("short_factor", None) + rope_scaling_long_factor = self.rope_scaling.get("long_factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]: + raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}") + if not ( + isinstance(rope_scaling_short_factor, list) + and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor) + ): + raise ValueError( + f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}" + ) + if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2: + raise ValueError( + f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}" + ) + if not ( + isinstance(rope_scaling_long_factor, list) + and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor) + ): + raise ValueError( + f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}" + ) + if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2: + raise ValueError( + f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}" + ) diff --git a/Unicorn_dense/bunny/model/language_model/phi3/modeling_phi3.py b/Unicorn_dense/bunny/model/language_model/phi3/modeling_phi3.py new file mode 100644 index 0000000000000000000000000000000000000000..0f001e5721d31cd7448dc1c5630baedc90d7ca82 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/phi3/modeling_phi3.py @@ -0,0 +1,1597 @@ +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" PyTorch Phi-3 model.""" + +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_phi3 import Phi3Config + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct" +_CONFIG_FOR_DOC = "Phi3Config" + +PHI3_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/Phi-3-mini-4k-instruct", + "microsoft/Phi-3-mini-128k-instruct", + # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3 +] + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3 +class Phi3RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Phi3RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3 +class Phi3RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.register_buffer("inv_freq", None, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if self.inv_freq is None: + self.inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim) + ) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding): + def __init__(self, dim, config, device=None): + super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) + + self.short_factor = config.rope_scaling["short_factor"] + self.long_factor = config.rope_scaling["long_factor"] + self.original_max_position_embeddings = config.original_max_position_embeddings + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + seq_len = torch.max(position_ids) + 1 + if seq_len > self.original_max_position_embeddings: + ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device) + else: + ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device) + + inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim + self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape) + + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + + cos = emb.cos() * scaling_factor + sin = emb.sin() * scaling_factor + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding): + def __init__(self, dim, config, device=None): + super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) + + self.short_factor = config.rope_scaling["short_factor"] + self.long_factor = config.rope_scaling["long_factor"] + self.original_max_position_embeddings = config.original_max_position_embeddings + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + seq_len = torch.max(position_ids) + 1 + if seq_len > self.original_max_position_embeddings: + ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device) + else: + ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device) + + inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim + self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape) + + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = 0.1 * math.log(scale) + 1.0 + + cos = emb.cos() * scaling_factor + sin = emb.sin() * scaling_factor + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Phi3MLP(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + + self.activation_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + up_states = self.gate_up_proj(hidden_states) + + gate, up_states = up_states.chunk(2, dim=-1) + up_states = up_states * self.activation_fn(gate) + + return self.down_proj(up_states) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Phi3Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.original_max_position_embeddings = config.original_max_position_embeddings + self.rope_theta = config.rope_theta + self.rope_scaling = config.rope_scaling + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False) + self._init_rope() + + def _init_rope(self): + if self.rope_scaling is None: + self.rotary_emb = Phi3RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + if scaling_type == "su": + self.rotary_emb = Phi3SuScaledRotaryEmbedding(self.head_dim, self.config) + elif scaling_type == "yarn": + self.rotary_emb = Phi3YarnScaledRotaryEmbedding(self.head_dim, self.config) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + logger.warning_once("You are not running the flash-attention implementation, expect numerical differences.") + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Phi3FlashAttention2(Phi3Attention): + """ + Phi-3 flash attention module. This module inherits from `Phi3Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # Phi3FlashAttention2 attention does not support output_attentions + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention. Please use `attn_implementation='eager'` or upgrade flash-attn library." + ) + raise ValueError("The current flash attention version does not support sliding window attention.") + + output_attentions = False + + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_dropout = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. + + if query_states.dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.qkv_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=attn_dropout, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._flash_attention_forward + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3 +# TODO @Arthur no longer copied from LLama after static cache +class Phi3SdpaAttention(Phi3Attention): + """ + Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Phi3Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Phi3Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +PHI3_ATTENTION_CLASSES = { + "eager": Phi3Attention, + "flash_attention_2": Phi3FlashAttention2, + "sdpa": Phi3SdpaAttention, +} + + +class Phi3DecoderLayer(nn.Module): + def __init__(self, config: Phi3Config, layer_idx: int): + super().__init__() + + self.config = config + self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) + + self.mlp = Phi3MLP(config) + self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) + self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) + self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = residual + self.resid_attn_dropout(attn_outputs) + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.resid_mlp_dropout(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +PHI3_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Phi3Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Phi-3 model outputting raw hidden-states without any specific head on top.", + PHI3_START_DOCSTRING, +) +class Phi3PreTrainedModel(PreTrainedModel): + config_class = Phi3Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Phi3DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = False + _supports_cache_class = True + + _version = "0.0.5" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +PHI3_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Phi-3 model outputting raw hidden-states without any specific head on top.", + PHI3_START_DOCSTRING, +) +class Phi3Model(Phi3PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi3DecoderLayer`] + + Args: + config: Phi3Config + """ + + def __init__(self, config: Phi3Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.embed_dropout = nn.Dropout(config.embd_pdrop) + self.layers = nn.ModuleList( + [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = 0 + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Phi3ForCausalLM(Phi3PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3 + def __init__(self, config): + super().__init__(config) + self.model = Phi3Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings + def get_input_embeddings(self): + return self.model.embed_tokens + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings + def get_output_embeddings(self): + return self.lm_head + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder + def set_decoder(self, decoder): + self.model = decoder + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder + def get_decoder(self): + return self.model + + # Ignore copy + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Phi3ForCausalLM + + >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct") + + >>> prompt = "This is an example script ." + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + 'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum' + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + # Copied from transformers.models.persimmon.modeling_persimmon.PersimmonForCausalLM.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + else: + remove_prefix_length = input_ids.shape[1] - 1 + input_ids = input_ids[:, remove_prefix_length:] + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The [`Phi3Model`] with a sequence classification head on top (linear layer). + + [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + PHI3_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs +class Phi3ForSequenceClassification(Phi3PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Phi3Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + model_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = model_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + model_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=model_outputs.past_key_values, + hidden_states=model_outputs.hidden_states, + attentions=model_outputs.attentions, + ) + + +@add_start_docstrings( + """ + [`Phi3Model`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + PHI3_START_DOCSTRING, +) +# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs +class Phi3ForTokenClassification(Phi3PreTrainedModel): + def __init__(self, config: Phi3Config): + super().__init__(config) + self.num_labels = config.num_labels + + self.model = Phi3Model(config) + if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + model_outputs = self.model( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = model_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + batch_size, seq_length = labels.shape + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length) + ) + + if not return_dict: + output = (logits,) + model_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=model_outputs.hidden_states, + attentions=model_outputs.attentions, + ) diff --git a/Unicorn_dense/bunny/model/language_model/qwen2/__init__.py b/Unicorn_dense/bunny/model/language_model/qwen2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b739f931448214584fffbdc5ffa8c36473001035 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/qwen2/__init__.py @@ -0,0 +1,80 @@ +# Copyright 2024 The Qwen Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from transformers.utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_tokenizers_available, + is_torch_available, +) + + +_import_structure = { + "configuration_qwen2": ["QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Qwen2Config"], + "tokenization_qwen2": ["Qwen2Tokenizer"], +} + +try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_qwen2_fast"] = ["Qwen2TokenizerFast"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_qwen2"] = [ + "Qwen2ForCausalLM", + "Qwen2Model", + "Qwen2PreTrainedModel", + "Qwen2ForSequenceClassification", + ] + + +if TYPE_CHECKING: + from .configuration_qwen2 import QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP, Qwen2Config + from .tokenization_qwen2 import Qwen2Tokenizer + + try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_qwen2_fast import Qwen2TokenizerFast + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_qwen2 import ( + Qwen2ForCausalLM, + Qwen2ForSequenceClassification, + Qwen2Model, + Qwen2PreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file diff --git a/Unicorn_dense/bunny/model/language_model/qwen2/__pycache__/__init__.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/qwen2/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa378bd44767b387408631c5c356718eb44c49f4 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/qwen2/__pycache__/__init__.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/qwen2/__pycache__/configuration_qwen2.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/qwen2/__pycache__/configuration_qwen2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..432b6ed02fce610207d8aaee70d9b8b7e58e2041 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/qwen2/__pycache__/configuration_qwen2.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/qwen2/__pycache__/modeling_qwen2.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/qwen2/__pycache__/modeling_qwen2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..832a6bd2b158b44141ea736a3b7f493b51a30651 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/qwen2/__pycache__/modeling_qwen2.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/qwen2/configuration_qwen2.py b/Unicorn_dense/bunny/model/language_model/qwen2/configuration_qwen2.py new file mode 100644 index 0000000000000000000000000000000000000000..c3d72b298d2f10ce17458198058237a0784ad26b --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/qwen2/configuration_qwen2.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Qwen2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", +} + + +class Qwen2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) \ No newline at end of file diff --git a/Unicorn_dense/bunny/model/language_model/qwen2/modeling_qwen2.py b/Unicorn_dense/bunny/model/language_model/qwen2/modeling_qwen2.py new file mode 100644 index 0000000000000000000000000000000000000000..fed91276a28ae23072c31f15c86d5334989c2366 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/qwen2/modeling_qwen2.py @@ -0,0 +1,1403 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Qwen2 model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_qwen2 import Qwen2Config + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "Qwen2Config" + +QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Qwen/Qwen2-7B-beta", + # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 +class Qwen2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Qwen2 +class Qwen2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 +class Qwen2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Qwen2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Qwen2FlashAttention2(Qwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` + as the weights of the module stays untouched. The only required change would be on the forward pass + where it needs to correctly call the public API of flash attention and deal with padding tokens + in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom + config.max_window_layers layers. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and self.config.use_sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." + ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Decide whether to use SWA or not by layer index. + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: + use_sliding_windows = False + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Qwen2 +class Qwen2SdpaAttention(Qwen2Attention): + """ + Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + "flash_attention_2": Qwen2FlashAttention2, + "sdpa": Qwen2SdpaAttention, +} + + +class Qwen2DecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." + ) + self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +QWEN2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Qwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2PreTrainedModel(PreTrainedModel): + config_class = Qwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +QWEN2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Qwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + else: + remove_prefix_length = input_ids.shape[1] - 1 + input_ids = input_ids[:, remove_prefix_length:] + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + QWEN2_START_DOCSTRING, +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) \ No newline at end of file diff --git a/Unicorn_dense/bunny/model/language_model/qwen2/tokenization_qwen2.py b/Unicorn_dense/bunny/model/language_model/qwen2/tokenization_qwen2.py new file mode 100644 index 0000000000000000000000000000000000000000..37f18030e8ef58d0e8d4aed466a91fc75cc44359 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/qwen2/tokenization_qwen2.py @@ -0,0 +1,345 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Qwen2.""" + +import json +import os +import unicodedata +from functools import lru_cache +from typing import Optional, Tuple + +import regex as re + +from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"}, + "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"}, +} + +MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} + +PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" + + +@lru_cache() +# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class Qwen2Tokenizer(PreTrainedTokenizer): + """ + Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding. + + Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import Qwen2Tokenizer + + >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer") + >>> tokenizer("Hello world")["input_ids"] + [9707, 1879] + + >>> tokenizer(" Hello world")["input_ids"] + [21927, 1879] + ``` + This is expected. + + You should not use GPT2Tokenizer instead, because of the different pretokenization rules. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*): + The beginning of sequence token. Not applicable for this tokenizer. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The token used for padding, for example when batching sequences of different lengths. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not the model should cleanup the spaces that were added when splitting the input text during the + tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces. + split_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the special tokens should be split during the tokenization process. The default behavior is + to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") = + ['<|endoftext|>`]. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will be give `['<', + '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = MAX_MODEL_INPUT_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + unk_token="<|endoftext|>", + bos_token=None, + eos_token="<|endoftext|>", + pad_token="<|endoftext|>", + clean_up_tokenization_spaces=False, + split_special_tokens=False, + **kwargs, + ): + # Qwen vocab does not contain control tokens; added tokens need to be special + bos_token = ( + AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(bos_token, str) + else bos_token + ) + eos_token = ( + AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(eos_token, str) + else eos_token + ) + unk_token = ( + AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(unk_token, str) + else unk_token + ) + pad_token = ( + AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(pad_token, str) + else pad_token + ) + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_merges = [] + with open(merges_file, encoding="utf-8") as merges_handle: + for line in merges_handle: + line = line.strip() + if not line or line.startswith("#"): + continue + bpe_merges.append(tuple(line.split())) + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + # NOTE: the cache can grow without bound and will get really large for long running processes + # (esp. for texts of language that do not use space between word, e.g. Chinese); technically + # not a memory leak but appears as one. + # GPT2Tokenizer has the same problem, so let's be consistent. + self.cache = {} + + self.pat = re.compile(PRETOKENIZE_REGEX) + + if kwargs.get("add_prefix_space", False): + logger.warning_once( + f"{self.__class__.__name} does not support `add_prefix_space`, setting it to True has no effect." + ) + + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + unk_token=unk_token, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + split_special_tokens=split_special_tokens, + **kwargs, + ) + + @property + def vocab_size(self) -> int: + return len(self.encoder) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def decode( + self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = False, + spaces_between_special_tokens: bool = False, + **kwargs, + ) -> str: + # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers + # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer + return super().decode( + token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + spaces_between_special_tokens=spaces_between_special_tokens, + **kwargs, + ) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def prepare_for_tokenization(self, text, **kwargs): + text = unicodedata.normalize("NFC", text) + return (text, kwargs) \ No newline at end of file diff --git a/Unicorn_dense/bunny/model/language_model/qwen2/tokenization_qwen2_fast.py b/Unicorn_dense/bunny/model/language_model/qwen2/tokenization_qwen2_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..dda95588b84c4ef5a3098e5e655d86ad142675bf --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/qwen2/tokenization_qwen2_fast.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Qwen2.""" + +from typing import Optional, Tuple + +from transformers.tokenization_utils import AddedToken +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast +from transformers.utils import logging +from .tokenization_qwen2 import Qwen2Tokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + "tokenizer_file": "tokenizer.json", +} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"}, + "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"}, + "tokenizer_file": { + "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/tokenizer.json" + }, +} + +MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} + + +class Qwen2TokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level + Byte-Pair-Encoding. + + Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import Qwen2TokenizerFast + + >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer") + >>> tokenizer("Hello world")["input_ids"] + [9707, 1879] + + >>> tokenizer(" Hello world")["input_ids"] + [21927, 1879] + ``` + This is expected. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`, *optional*): + Path to the vocabulary file. + merges_file (`str`, *optional*): + Path to the merges file. + tokenizer_file (`str`, *optional*): + Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that + contains everything needed to load the tokenizer. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. Not applicable to this tokenizer. + bos_token (`str`, *optional*): + The beginning of sequence token. Not applicable for this tokenizer. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The token used for padding, for example when batching sequences of different lengths. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = MAX_MODEL_INPUT_SIZES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = Qwen2Tokenizer + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + unk_token="<|endoftext|>", + bos_token=None, + eos_token="<|endoftext|>", + pad_token="<|endoftext|>", + **kwargs, + ): + # We need to at least pass vocab_file and merges_file to base class + # in case a slow tokenizer needs to be initialized; other can be + # configured through files. + # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token + + bos_token = ( + AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(bos_token, str) + else bos_token + ) + eos_token = ( + AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(eos_token, str) + else eos_token + ) + unk_token = ( + AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(unk_token, str) + else unk_token + ) + pad_token = ( + AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(pad_token, str) + else pad_token + ) + + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + **kwargs, + ) + + # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) \ No newline at end of file diff --git a/Unicorn_dense/bunny/model/language_model/stable_lm/__pycache__/configuration_stablelm_epoch.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/stable_lm/__pycache__/configuration_stablelm_epoch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0cee389ddfcc87a6f959cb7a4cf3fd32c545d5a Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/stable_lm/__pycache__/configuration_stablelm_epoch.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/stable_lm/__pycache__/modeling_stablelm_epoch.cpython-310.pyc b/Unicorn_dense/bunny/model/language_model/stable_lm/__pycache__/modeling_stablelm_epoch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..164008cde8d3ad477ef9d95c3d1598770e62fbb2 Binary files /dev/null and b/Unicorn_dense/bunny/model/language_model/stable_lm/__pycache__/modeling_stablelm_epoch.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/language_model/stable_lm/configuration_stablelm_epoch.py b/Unicorn_dense/bunny/model/language_model/stable_lm/configuration_stablelm_epoch.py new file mode 100644 index 0000000000000000000000000000000000000000..ea24f38659d600c66dc00767eb52d9a7115cffe9 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/stable_lm/configuration_stablelm_epoch.py @@ -0,0 +1,113 @@ +# Copyright 2023 Stability and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" StableLM Epoch model configuration""" +from transformers import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +class StableLMEpochConfig(PretrainedConfig): + r""" + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 50_304): + Vocabulary size of the StableLM model. Defines the number of different tokens that + can be represented by the `inputs_ids` passed when calling [`StableLMEpochModel`]. + intermediate_size (`int`, *optional*, defaults to 6912): + Dimension of the MLP representations. + hidden_size (`int`, *optional*, defaults to 2560): + Dimension of the decoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string). + rope_pct (`float`, *optional*, defaults to 1.0): + Percentage of hidden dimensions to allocate to rotary embeddings. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 1e-5): + The standard deviation of the truncated_normal_initializer for initializing + all weight matrices. + norm_eps (`float`, *optional*, defaults to 1e-8): + The epsilon used by the normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions + (not used by all models). Only relevant if `config.is_decoder=True`. + use_qkv_bias (`bool`, *optional*, defaults to `True`): + Whether or not the model should use bias for qkv layers. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + """ + model_type = "stablelm_epoch" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=50_304, + intermediate_size=6912, + hidden_size=2560, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + rope_pct=0.25, + rope_theta=10_000, + max_position_embeddings=4096, + initializer_range=0.02, + norm_eps=1.0e-5, + use_cache=True, + use_qkv_bias=True, + bos_token_id=0, + eos_token_id=2, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.intermediate_size = intermediate_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.rope_pct = rope_pct + self.rope_theta = rope_theta + self.initializer_range = initializer_range + self.norm_eps = norm_eps + self.use_cache = use_cache + self.use_qkv_bias = use_qkv_bias + self.tie_word_embeddings = tie_word_embeddings + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/Unicorn_dense/bunny/model/language_model/stable_lm/modeling_stablelm_epoch.py b/Unicorn_dense/bunny/model/language_model/stable_lm/modeling_stablelm_epoch.py new file mode 100644 index 0000000000000000000000000000000000000000..930b98f91a1b53d8a1cac67f6a030ab5cb078f32 --- /dev/null +++ b/Unicorn_dense/bunny/model/language_model/stable_lm/modeling_stablelm_epoch.py @@ -0,0 +1,917 @@ +# coding=utf-8 +# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This code is based off the following work: +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py +""" PyTorch StableLM Epoch model. """ +from typing import Optional, Tuple, Union +import math +import warnings + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.cache_utils import Cache +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging, is_flash_attn_greater_or_equal_2_10 + +from .configuration_stablelm_epoch import StableLMEpochConfig + +try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input +except: + flash_attn_func, flash_attn_varlen_func = None, None + index_first_axis, pad_input, unpad_input = None, None, None + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, + dtype: torch.dtype, + device: torch.device, + past_key_values_length: int = 0, +): + """Make causal mask used for bi-directional self-attention.""" + batch_size, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.finfo(torch.float16).min, device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(batch_size, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, tgt_seq_len, src_seq_len]`.""" + batch_size, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(batch_size, 1, tgt_len, src_len).to(dtype) + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(dtype).min + ) + + +class RotaryEmbedding(nn.Module): + def __init__( + self, + dim: int, + max_position_embeddings: int, + base: int = 10_000, + device: Optional[torch.device] = None, + ): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype(), + ) + + def _set_cos_sin_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.float32) + + # Don't do einsum, it converts fp32 to fp16 under AMP + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + + def forward(self, x: torch.Tensor, seq_len: Optional[int] = None): + # x: [batch_size, num_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.get_default_dtype()) + return ( + self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + ) + + +def rotate_half(x: torch.Tensor): + """Rotates half the hidden dims of the input.""" + x1, x2 = torch.chunk(x, 2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [batch_size, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [batch_size, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MLP(nn.Module): + def __init__(self, config: StableLMEpochConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + self.act_fn = nn.SiLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Attention(nn.Module): + def __init__(self, config: StableLMEpochConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.use_qkv_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_qkv_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_qkv_bias) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + + self._init_rope() + + def _init_rope(self): + self.rotary_ndims = int(self.head_dim * self.config.rope_pct) + self.rotary_emb = RotaryEmbedding( + self.rotary_ndims, + max_position_embeddings=self.config.max_position_embeddings, + base=self.config.rope_theta, + ) + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + position_ids: torch.LongTensor, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_rot = query_states[..., : self.rotary_ndims] + query_pass = query_states[..., self.rotary_ndims :] + key_rot = key_states[..., : self.rotary_ndims] + key_pass = key_states[..., self.rotary_ndims :] + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) + + # [batch_size, num_heads, seq_len, head_dim] + query_states = torch.cat((query_states, query_pass), dim=-1) + key_states = torch.cat((key_states, key_pass), dim=-1) + + if past_key_value is not None: + # Reuse k, v, self_attention + key_states = torch.cat((past_key_value[0], key_states), dim=2) + value_states = torch.cat((past_key_value[1], value_states), dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # Repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # Upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + # Merge heads + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + # Final linear projection + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class FlashAttention2(Attention): + """ + Reference: https://github.com/huggingface/transformers/blob/5d36025ca13d05151b7a0c761e90d429c4644a30/src/transformers/models/llama/modeling_llama.py#L456 + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # FlashAttention2 attention does not support output_attentions + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_rot = query_states[..., : self.rotary_ndims] + query_pass = query_states[..., self.rotary_ndims :] + key_rot = key_states[..., : self.rotary_ndims] + key_pass = key_states[..., self.rotary_ndims :] + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) + + # [batch_size, num_heads, seq_len, head_dim] + query_states = torch.cat((query_states, query_pass), dim=-1) + key_states = torch.cat((key_states, key_pass), dim=-1) + + if past_key_value is not None: + # Reuse k, v, self_attention + key_states = torch.cat((past_key_value[0], key_states), dim=2) + value_states = torch.cat((past_key_value[1], value_states), dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate + ) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in FlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +ATTENTION_CLASSES = { + "eager": Attention, + "flash_attention_2": FlashAttention2, +} + + +class DecoderLayer(nn.Module): + def __init__(self, config: StableLMEpochConfig): + super().__init__() + self.self_attn = ATTENTION_CLASSES[config._attn_implementation](config=config) + self.mlp = MLP(config) + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps) + + def forward( + self, + hidden_states: Optional[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class StableLMEpochPreTrainedModel(PreTrainedModel): + """An abstract class to handle weights initialization and a simple interface + for downloading and loading pretrained models. + """ + + config_class = StableLMEpochConfig + base_model_prefix = "transformer" + supports_gradient_checkpointing = True + _no_split_modules = ["DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + + def _init_weights(self, module: nn.Module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module: nn.Module, value=False): + if isinstance(module, StableLMEpochModel): + module.gradient_checkpointing = value + + +class StableLMEpochModel(StableLMEpochPreTrainedModel): + def __init__(self, config: StableLMEpochConfig): + super().__init__(config) + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps) + + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value: nn.Module): + self.embed_tokens = value + + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: torch.Size, + inputs_embeds: torch.Tensor, + past_key_values_length: int, + ): + # Create causal mask + # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ).to(inputs_embeds.device) + combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + + return combined_attention_mask + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError( + "You have to specify either decoder_input_ids or decoder_inputs_embeds" + ) + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # Embed positions + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), + dtype=torch.bool, + device=inputs_embeds.device, + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # Decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = ( + past_key_values[idx] if past_key_values is not None else None + ) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # Add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class StableLMEpochForCausalLM(StableLMEpochPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: StableLMEpochConfig): + super().__init__(config) + + self.model = StableLMEpochModel(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings: nn.Module): + self.lm_head = new_embeddings + + def get_decoder(self): + return self.model + + def set_decoder(self, decoder): + self.model = decoder + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states).float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs, + ): + # Trim decoder_input_ids if past is used + if past_key_values is not None: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # Create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # If `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past + ), + ) + return reordered_past + + +StableLMEpochConfig.register_for_auto_class() +StableLMEpochForCausalLM.register_for_auto_class("AutoModelForCausalLM") diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/builder.py b/Unicorn_dense/bunny/model/multimodal_encoder/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..dbec1c6db823e7bc810465dddb28eff952eac0b7 --- /dev/null +++ b/Unicorn_dense/bunny/model/multimodal_encoder/builder.py @@ -0,0 +1,122 @@ +import torch +import torch.nn as nn + +import os +import pickle + + +# class LLM2CLIPTextTower(nn.Module): +# def __init__(self): +# super().__init__() + +# self.image_processor = CLIPImageProcessor.from_pretrained("/data/xmyu/Bunny_all/checkpoints/clip-vit-large-patch14-336") +# self.model = AutoModel.from_pretrained( +# "/data/xmyu/Bunny_all/checkpoints/LLM2CLIP-Openai-L-14-336", +# torch_dtype=torch.bfloat16, +# trust_remote_code=True).to('cuda').eval() + +# self.model.requires_grad_(False) + + +# self.llm_model_name = '/data/xmyu/Bunny_all/checkpoints/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned' +# config = AutoConfig.from_pretrained( +# self.llm_model_name, trust_remote_code=True +# ) +# self.llm_model = AutoModel.from_pretrained(self.llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True) +# self.llm2clip_tokenizer = AutoTokenizer.from_pretrained(self.llm_model_name) +# self.llm_model.config._name_or_path = '/data/xmyu/Bunny_all/checkpoints/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC +# self.l2v = LLM2Vec(self.llm_model, self.llm2clip_tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512) + +# self.is_loaded = True + +# def forward_img(self, images): +# if type(images) is list: +# image_features = [] +# for image in images: +# image_feature = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), +# output_hidden_states=True) +# image_features.append(image_feature) +# else: +# image_features = self.vision_tower(images.to(device=self.device, dtype=self.dtype), +# output_hidden_states=True) + +# return image_features + +# def forward(self, captions): + +# if type(captions) is list: +# caption_features = self.l2v.encode(captions, convert_to_tensor=True).to('cuda') + +# with torch.no_grad(), torch.cuda.amp.autocast(): + +# caption_features = self.model.get_text_features(caption_features) +# caption_features /= caption_features.norm(dim=-1, keepdim=True) + +# print('<-------------------------->') +# print(caption_features.shape) +# print('<-------------------------->') + +# return caption_features + +# @property +# def dummy_feature(self): +# return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + +# @property +# def dtype(self): +# return self.vision_tower.dtype + +# @property +# def device(self): +# return self.vision_tower.device + +# @property +# def hidden_size(self): +# return 1280 + +# def build_vision_tower(vision_tower_cfg, **kwargs): + +# return LLM2CLIPTextTower() + + +class LLM2CLIPTextTower(nn.Module): + def __init__(self): + super().__init__() + + folder_path = '/data/xmyu/Bunny_all/data/embeddings/pkl/captions' # 替换为实际的文件夹路径 + self.embeddings = [] + + for filename in os.listdir(folder_path): + if filename.endswith('.pkl'): + file_path = os.path.join(folder_path, filename) + with open(file_path, 'rb') as f: + self.embeddings.extend(pickle.load(f)) + + def forward(self, ids): + embeds = [] + for id_ in ids: + embed = next((d['embed'] for d in self.embeddings if d['id'] == id_), None) + if embed is None: + raise KeyError(f"ID {id_} not found in embeddings.") + embeds.append(embed) + return embeds + + @property + def dummy_feature(self): + return torch.zeros(1, 1280, device=self.device, dtype=self.dtype) + + # @property + # def dtype(self): + # return self.vision_tower.dtype + + # @property + # def device(self): + # return self.vision_tower.device + + @property + def hidden_size(self): + return 1280 + +def build_vision_tower(vision_tower_cfg, **kwargs): + + return LLM2CLIPTextTower() \ No newline at end of file diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/clip/__pycache__/clip_encoder.cpython-310.pyc b/Unicorn_dense/bunny/model/multimodal_encoder/clip/__pycache__/clip_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2941645f955d154af544b6837c3a344ba38e47a Binary files /dev/null and b/Unicorn_dense/bunny/model/multimodal_encoder/clip/__pycache__/clip_encoder.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/clip/clip_encoder.py b/Unicorn_dense/bunny/model/multimodal_encoder/clip/clip_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..b499817af1cecf4b000eda5415d0436a691519c5 --- /dev/null +++ b/Unicorn_dense/bunny/model/multimodal_encoder/clip/clip_encoder.py @@ -0,0 +1,77 @@ +import torch +import torch.nn as nn + +from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig + + +class CLIPVisionTower(nn.Module): + def __init__(self, vision_tower, args, delay_load=False): + super().__init__() + + self.is_loaded = False + + self.vision_tower_name = vision_tower + self.select_layer = -2 + + if not delay_load: + self.load_model() + else: + self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) + + def load_model(self): + if self.is_loaded: + return + self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) + self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) + self.vision_tower.requires_grad_(False) + + self.is_loaded = True + + def feature_select(self, image_forward_outs): + image_features = image_forward_outs.hidden_states[self.select_layer] + + image_features = image_features[:, 1:] + + return image_features + + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), + output_hidden_states=True) + image_feature = self.feature_select(image_forward_out).to(image.dtype) + image_features.append(image_feature) + else: + image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), + output_hidden_states=True) + image_features = self.feature_select(image_forward_outs).to(images.dtype) + + return image_features + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + @property + def dtype(self): + return self.vision_tower.dtype + + @property + def device(self): + return self.vision_tower.device + + @property + def config(self): + if self.is_loaded: + return self.vision_tower.config + else: + return self.cfg_only + + @property + def hidden_size(self): + return self.config.hidden_size + + @property + def num_patches(self): + return (self.config.image_size // self.config.patch_size) ** 2 diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/__pycache__/eva_clip_encoder.cpython-310.pyc b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/__pycache__/eva_clip_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3df5b8586d80f9c1c94ea43e5f129b4fe5cf7666 Binary files /dev/null and b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/__pycache__/eva_clip_encoder.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/__pycache__/eva_clip_processors.cpython-310.pyc b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/__pycache__/eva_clip_processors.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f49409090618ab5bf4131d6e81ff81df0cc7f60 Binary files /dev/null and b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/__pycache__/eva_clip_processors.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/__pycache__/eva_vit.cpython-310.pyc b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/__pycache__/eva_vit.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..277ee98e8470ab6a93b48fa440541d097a089ea1 Binary files /dev/null and b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/__pycache__/eva_vit.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/eva_clip_encoder.py b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/eva_clip_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..adba2b21b105fe49a3a6a3c93114c9daaa30b5e4 --- /dev/null +++ b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/eva_clip_encoder.py @@ -0,0 +1,64 @@ +import torch +import torch.nn as nn + +from .eva_clip_processors import EvaClipImageTrainProcessor +from .eva_vit import Eva2LargePlusEncoder + + +class EvaClipVisionTower(nn.Module): + def __init__(self, vision_tower, args, delay_load=False): + super().__init__() + + self.is_loaded = False + + self.vision_tower_path = vision_tower + self.config = VisionTowerConfig() + + if not delay_load: + self.load_model() + else: + self.cfg_only = self.config + + def load_model(self): + if self.is_loaded: + return + self.image_processor = EvaClipImageTrainProcessor(self.config.image_size) + self.vision_tower = Eva2LargePlusEncoder(self.vision_tower_path) + self.vision_tower.requires_grad_(False) + + self.is_loaded = True + + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_feature = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0)).to( + image.dtype) + image_features.append(image_feature) + else: + image_features = self.vision_tower(images.to(device=self.device, dtype=self.dtype)).to(images.dtype) + + return image_features + + @property + def dtype(self): + return self.vision_tower.dtype + + @property + def device(self): + return self.vision_tower.device + + @property + def hidden_size(self): + return self.config.hidden_size + + @property + def num_patches(self): + return (self.config.image_size // self.config.patch_size) ** 2 + + +class VisionTowerConfig(): + def __init__(self): + self.image_size = 336 + self.patch_size = 14 + self.hidden_size = 1024 diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/eva_clip_processors.py b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/eva_clip_processors.py new file mode 100644 index 0000000000000000000000000000000000000000..26a56fa06462bf36a476a1fb3ce9285107884fcb --- /dev/null +++ b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/eva_clip_processors.py @@ -0,0 +1,68 @@ +''' +# Adapted from https://github.com/baaivision/EVA/tree/master/EVA-CLIP +''' + +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from transformers.image_processing_utils import BatchFeature +from PIL import Image +from transformers.image_transforms import convert_to_rgb + + +class BaseProcessor: + def __init__(self): + self.transform = lambda x: x + return + + def __call__(self, item): + return self.transform(item) + + +class EvaClipImageBaseProcessor(BaseProcessor): + def __init__(self, mean=None, std=None): + self.mean = (0.48145466, 0.4578275, 0.40821073) if mean is None else mean + self.std = (0.26862954, 0.26130258, 0.27577711) if std is None else std + + self.normalize = transforms.Normalize(self.mean, self.std) + + @property + def image_mean(self): + return self.mean + + +class EvaClipImageTrainProcessor(EvaClipImageBaseProcessor): + def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0): + super().__init__(mean=mean, std=std) + + self.transform = transforms.Compose( + [ + convert_to_rgb, + transforms.Resize( + image_size, + interpolation=InterpolationMode.BICUBIC, + ), + transforms.CenterCrop(image_size), + transforms.ToTensor(), + self.normalize, + ] + ) + + self.image_size = image_size + + def preprocess(self, images, return_tensors): + if isinstance(images, Image.Image): + images = [images] + else: + assert isinstance(images, list) + + transformed_images = [self.transform(image).numpy() for image in images] + data = {"pixel_values": transformed_images} + + return BatchFeature(data=data, tensor_type=return_tensors) + + def __call__(self, item): + return self.transform(item) + + @property + def crop_size(self): + return {'height': self.image_size, 'width': self.image_size} diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/eva_vit.py b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/eva_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..3fe293c70d72911f2ac3b94c68bd58176e72a126 --- /dev/null +++ b/Unicorn_dense/bunny/model/multimodal_encoder/eva_clip/eva_vit.py @@ -0,0 +1,851 @@ +''' +# Adapted from https://github.com/baaivision/EVA/tree/master/EVA-CLIP +''' + +from math import pi +import torch +from torch import nn +from einops import rearrange, repeat +import logging + + +def broadcat(tensors, dim=-1): + num_tensors = len(tensors) + shape_lens = set(list(map(lambda t: len(t.shape), tensors))) + assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions' + shape_len = list(shape_lens)[0] + dim = (dim + shape_len) if dim < 0 else dim + dims = list(zip(*map(lambda t: list(t.shape), tensors))) + expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] + assert all( + [*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatentation' + max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims)) + expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims)) + expanded_dims.insert(dim, (dim, dims[dim])) + expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims))) + tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes))) + return torch.cat(tensors, dim=dim) + + +def rotate_half(x): + x = rearrange(x, '... (d r) -> ... d r', r=2) + x1, x2 = x.unbind(dim=-1) + x = torch.stack((-x2, x1), dim=-1) + return rearrange(x, '... d r -> ... (d r)') + + +class VisionRotaryEmbeddingFast(nn.Module): + def __init__( + self, + dim, + pt_seq_len, + ft_seq_len=None, + custom_freqs=None, + freqs_for='lang', + theta=10000, + max_freq=10, + num_freqs=1, + patch_dropout=0. + ): + super().__init__() + if custom_freqs: + freqs = custom_freqs + elif freqs_for == 'lang': + freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) + elif freqs_for == 'pixel': + freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi + elif freqs_for == 'constant': + freqs = torch.ones(num_freqs).float() + else: + raise ValueError(f'unknown modality {freqs_for}') + + if ft_seq_len is None: ft_seq_len = pt_seq_len + t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len + + freqs = torch.einsum('..., f -> ... f', t, freqs) + freqs = repeat(freqs, '... n -> ... (n r)', r=2) + freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim=-1) + + freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) + freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) + + self.patch_dropout = patch_dropout + + self.register_buffer("freqs_cos", freqs_cos) + self.register_buffer("freqs_sin", freqs_sin) + + logging.info(f'Shape of rope freq: {self.freqs_cos.shape}') + + def forward(self, t, patch_indices_keep=None): + if patch_indices_keep is not None: + batch = t.size()[0] + batch_indices = torch.arange(batch) + batch_indices = batch_indices[..., None] + + freqs_cos = repeat(self.freqs_cos, 'i j -> n i m j', n=t.shape[0], m=t.shape[1]) + freqs_sin = repeat(self.freqs_sin, 'i j -> n i m j', n=t.shape[0], m=t.shape[1]) + + freqs_cos = freqs_cos[batch_indices, patch_indices_keep] + freqs_cos = rearrange(freqs_cos, 'n i m j -> n m i j') + freqs_sin = freqs_sin[batch_indices, patch_indices_keep] + freqs_sin = rearrange(freqs_sin, 'n i m j -> n m i j') + + return t * freqs_cos + rotate_half(t) * freqs_sin + + return t * self.freqs_cos + rotate_half(t) * self.freqs_sin + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm (with cast back to input dtype).""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + return x.to(orig_type) + + +class PatchDropout(nn.Module): + """ + https://arxiv.org/abs/2212.00794 + """ + + def __init__(self, prob, exclude_first_token=True): + super().__init__() + assert 0 <= prob < 1. + self.prob = prob + self.exclude_first_token = exclude_first_token # exclude CLS token + logging.info(f"os.getenv('RoPE')={os.getenv('RoPE')}") + + def forward(self, x): + if not self.training or self.prob == 0.: + return x + + if self.exclude_first_token: + cls_tokens, x = x[:, :1], x[:, 1:] + else: + cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1]) + + batch = x.size()[0] + num_tokens = x.size()[1] + + batch_indices = torch.arange(batch) + batch_indices = batch_indices[..., None] + + keep_prob = 1 - self.prob + num_patches_keep = max(1, int(num_tokens * keep_prob)) + + rand = torch.randn(batch, num_tokens) + patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices + + x = x[batch_indices, patch_indices_keep] + + if self.exclude_first_token: + x = torch.cat((cls_tokens, x), dim=1) + + if self.training and os.getenv('RoPE') == '1': + return x, patch_indices_keep + + return x + + +# -------------------------------------------------------- +# Adapted from https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- +import math +import os +from functools import partial +import torch.nn as nn +import torch.nn.functional as F + +try: + from timm.models.layers import drop_path, to_2tuple, trunc_normal_ +except: + from timm.layers import drop_path, to_2tuple, trunc_normal_ + +if os.getenv('ENV_TYPE') == 'deepspeed': + try: + from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint + except: + from torch.utils.checkpoint import checkpoint +else: + from torch.utils.checkpoint import checkpoint + +import xformers.ops as xops + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) + + +class Mlp(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + drop=0., + subln=False, + + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + + self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity() + + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the orignal BERT implement + x = self.ffn_ln(x) + + x = self.fc2(x) + x = self.drop(x) + return x + + +class SwiGLU(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0., + norm_layer=nn.LayerNorm, subln=False): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.w1 = nn.Linear(in_features, hidden_features) + self.w2 = nn.Linear(in_features, hidden_features) + + self.act = act_layer() + self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity() + self.w3 = nn.Linear(hidden_features, out_features) + + self.drop = nn.Dropout(drop) + + def forward(self, x): + x1 = self.w1(x) + x2 = self.w2(x) + hidden = self.act(x1) * x2 + x = self.ffn_ln(hidden) + x = self.w3(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., window_size=None, attn_head_dim=None, xattn=False, rope=None, subln=False, + norm_layer=nn.LayerNorm): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.subln = subln + if self.subln: + self.q_proj = nn.Linear(dim, all_head_dim, bias=False) + self.k_proj = nn.Linear(dim, all_head_dim, bias=False) + self.v_proj = nn.Linear(dim, all_head_dim, bias=False) + else: + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.inner_attn_ln = norm_layer(all_head_dim) if subln else nn.Identity() + # self.proj = nn.Linear(all_head_dim, all_head_dim) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.xattn = xattn + self.xattn_drop = attn_drop + + self.rope = rope + + def forward(self, x, rel_pos_bias=None, attn_mask=None): + B, N, C = x.shape + if self.subln: + q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias) + k = F.linear(input=x, weight=self.k_proj.weight, bias=None) + v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias) + + q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) # B, num_heads, N, C + k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + else: + + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) + + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) # 3, B, num_heads, N, C + q, k, v = qkv[0], qkv[1], qkv[2] + + if self.rope: + # slightly fast impl + q_t = q[:, :, 1:, :] + ro_q_t = self.rope(q_t) + q = torch.cat((q[:, :, :1, :], ro_q_t), -2).type_as(v) + + k_t = k[:, :, 1:, :] + ro_k_t = self.rope(k_t) + k = torch.cat((k[:, :, :1, :], ro_k_t), -2).type_as(v) + + if self.xattn: + q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C + k = k.permute(0, 2, 1, 3) + v = v.permute(0, 2, 1, 3) + + x = xops.memory_efficient_attention( + q, k, v, + p=self.xattn_drop, + scale=self.scale, + ) + x = x.reshape(B, N, -1) + x = self.inner_attn_ln(x) + x = self.proj(x) + x = self.proj_drop(x) + else: + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.relative_position_bias_table is not None: + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0).type_as(attn) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias.type_as(attn) + + if attn_mask is not None: + attn_mask = attn_mask.bool() + attn = attn.masked_fill(~attn_mask[:, None, None, :], float("-inf")) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.inner_attn_ln(x) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm, + window_size=None, attn_head_dim=None, xattn=False, rope=None, postnorm=False, + subln=False, naiveswiglu=False): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim, + xattn=xattn, rope=rope, subln=subln, norm_layer=norm_layer) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + + if naiveswiglu: + self.mlp = SwiGLU( + in_features=dim, + hidden_features=mlp_hidden_dim, + subln=subln, + norm_layer=norm_layer, + ) + else: + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + subln=subln, + drop=drop + ) + + if init_values is not None and init_values > 0: + self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) + self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) + else: + self.gamma_1, self.gamma_2 = None, None + + self.postnorm = postnorm + + def forward(self, x, rel_pos_bias=None, attn_mask=None): + if self.gamma_1 is None: + if self.postnorm: + x = x + self.drop_path(self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) + x = x + self.drop_path(self.norm2(self.mlp(x))) + else: + x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + if self.postnorm: + x = x + self.drop_path( + self.gamma_1 * self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) + x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x))) + else: + x = x + self.drop_path( + self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +class RelativePositionBias(nn.Module): + + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + def forward(self): + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + +class EVAVisionTransformer(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, patch_dropout=0., + use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, rope=False, + use_mean_pooling=True, init_scale=0.001, grad_checkpointing=False, xattn=False, postnorm=False, + pt_hw_seq_len=16, intp_freq=False, naiveswiglu=False, subln=False): + super().__init__() + self.image_size = img_size + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + if use_abs_pos_emb: + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + if rope: + half_head_dim = embed_dim // num_heads // 2 + hw_seq_len = img_size // patch_size + self.rope = VisionRotaryEmbeddingFast( + dim=half_head_dim, + pt_seq_len=pt_hw_seq_len, + ft_seq_len=hw_seq_len if intp_freq else None, + # patch_dropout=patch_dropout + ) + else: + self.rope = None + + self.naiveswiglu = naiveswiglu + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, + xattn=xattn, rope=self.rope, postnorm=postnorm, subln=subln, naiveswiglu=naiveswiglu) + for i in range(depth)]) + self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim) + self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=.02) + + trunc_normal_(self.cls_token, std=.02) + # trunc_normal_(self.mask_token, std=.02) + + self.apply(self._init_weights) + self.fix_init_weight() + + if isinstance(self.head, nn.Linear): + trunc_normal_(self.head.weight, std=.02) + self.head.weight.data.mul_(init_scale) + self.head.bias.data.mul_(init_scale) + + # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn + self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity() + + self.grad_checkpointing = grad_checkpointing + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + if self.naiveswiglu: + rescale(layer.mlp.w3.weight.data, layer_id + 1) + else: + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def get_cast_dtype(self) -> torch.dtype: + return self.blocks[0].mlp.fc2.weight.dtype + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_num_layers(self): + return len(self.blocks) + + def lock(self, unlocked_groups=0, freeze_bn_stats=False): + assert unlocked_groups == 0, 'partial locking not currently supported for this model' + for param in self.parameters(): + param.requires_grad = False + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x, return_all_features=False): + + x = self.patch_embed(x) + batch_size, seq_len, _ = x.size() + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in + if os.getenv('RoPE') == '1': + if self.training and not isinstance(self.patch_dropout, nn.Identity): + x, patch_indices_keep = self.patch_dropout(x) + self.rope.forward = partial(self.rope.forward, patch_indices_keep=patch_indices_keep) + else: + self.rope.forward = partial(self.rope.forward, patch_indices_keep=None) + x = self.patch_dropout(x) + else: + x = self.patch_dropout(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for i, blk in enumerate(self.blocks): + if i == len(self.blocks) - 1: + continue + if self.grad_checkpointing: + x = checkpoint(blk, x, (rel_pos_bias,)) + else: + x = blk(x, rel_pos_bias=rel_pos_bias) + + if not return_all_features: + x = self.norm(x) + if self.fc_norm is not None: + return self.fc_norm(x.mean(1)) + else: + return x[:, 0] + return x + + def forward(self, x, return_all_features=False): + if return_all_features: + return self.forward_features(x, return_all_features) + x = self.forward_features(x) + x = self.head(x) + return x + + +def load_state_dict(checkpoint_path: str, map_location: str = 'cpu', model_key: str = 'model|module|state_dict', + is_openai: bool = False, skip_list: list = []): + if is_openai: + model = torch.jit.load(checkpoint_path, map_location="cpu").eval() + state_dict = model.state_dict() + for key in ["input_resolution", "context_length", "vocab_size"]: + state_dict.pop(key, None) + else: + checkpoint = torch.load(checkpoint_path, map_location=map_location) + for mk in model_key.split('|'): + if isinstance(checkpoint, dict) and mk in checkpoint: + state_dict = checkpoint[mk] + break + else: + state_dict = checkpoint + if next(iter(state_dict.items()))[0].startswith('module'): + state_dict = {k[7:]: v for k, v in state_dict.items()} + + for k in skip_list: + if k in list(state_dict.keys()): + logging.info(f"Removing key {k} from pretrained checkpoint") + del state_dict[k] + + if os.getenv('RoPE') == '1': + for k in list(state_dict.keys()): + if 'freqs_cos' in k or 'freqs_sin' in k: + del state_dict[k] + return state_dict + + +def load_clip_visual_state_dict(checkpoint_path: str, map_location: str = 'cpu', is_openai: bool = False, + skip_list: list = []): + state_dict = load_state_dict(checkpoint_path, map_location=map_location, is_openai=is_openai, skip_list=skip_list) + + for k in list(state_dict.keys()): + if not k.startswith('visual.'): + del state_dict[k] + for k in list(state_dict.keys()): + if k.startswith('visual.'): + new_k = k[7:] + state_dict[new_k] = state_dict[k] + del state_dict[k] + return state_dict + + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +try: + from apex.normalization import FusedLayerNorm +except: + FusedLayerNorm = LayerNorm + print( + "Please build and install Nvidia apex package with option '--cuda_ext' according to https://github.com/NVIDIA/apex#from-source .") + + +@dataclass +class CLIPVisionCfg: + layers: Union[Tuple[int, int, int, int], int] = 12 + width: int = 768 + head_width: int = 64 + mlp_ratio: float = 4.0 + patch_size: int = 16 + image_size: Union[Tuple[int, int], int] = 224 + ls_init_value: Optional[float] = None # layer scale initial value + patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results + global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580) + drop_path_rate: Optional[float] = None # drop path rate + timm_model_name: str = None # a valid model name overrides layers, width, patch_size + timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model + timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '') + timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '') + timm_proj_bias: bool = False # enable bias final projection + eva_model_name: str = None # a valid eva model name overrides layers, width, patch_size + qkv_bias: bool = True + fusedLN: bool = False + xattn: bool = False + postnorm: bool = False + rope: bool = False + pt_hw_seq_len: int = 16 # 224/14 + intp_freq: bool = False + naiveswiglu: bool = False + subln: bool = False + + +def _build_vision_tower( + vision_tower_path: str, + embed_dim: int, + vision_cfg: CLIPVisionCfg +): + if isinstance(vision_cfg, dict): + vision_cfg = CLIPVisionCfg(**vision_cfg) + + if vision_cfg.eva_model_name: + vision_heads = vision_cfg.width // vision_cfg.head_width + norm_layer = LayerNorm + + visual = EVAVisionTransformer( + img_size=vision_cfg.image_size, + patch_size=vision_cfg.patch_size, + num_classes=embed_dim, + use_mean_pooling=vision_cfg.global_average_pool, # False + init_values=vision_cfg.ls_init_value, + patch_dropout=vision_cfg.patch_dropout, + embed_dim=vision_cfg.width, + depth=vision_cfg.layers, + num_heads=vision_heads, + mlp_ratio=vision_cfg.mlp_ratio, + qkv_bias=vision_cfg.qkv_bias, + drop_path_rate=vision_cfg.drop_path_rate, + norm_layer=partial(FusedLayerNorm, eps=1e-6) if vision_cfg.fusedLN else partial(norm_layer, eps=1e-6), + xattn=vision_cfg.xattn, + rope=vision_cfg.rope, + postnorm=vision_cfg.postnorm, + pt_hw_seq_len=vision_cfg.pt_hw_seq_len, # 224/14 + intp_freq=vision_cfg.intp_freq, + naiveswiglu=vision_cfg.naiveswiglu, + subln=vision_cfg.subln + ) + + state_dict = load_clip_visual_state_dict(vision_tower_path) + incompatible_keys = visual.load_state_dict(state_dict, strict=False) + print('EVA-CLIP incompatible_keys:', incompatible_keys) + + return visual + + +class Eva2LargePlusEncoder(nn.Module): + def __init__(self, vision_tower_path): + super(Eva2LargePlusEncoder, self).__init__() + self.config = { + "embed_dim": 768, + "vision_cfg": { + "image_size": 336, + "layers": 24, + "width": 1024, + "drop_path_rate": 0, + "head_width": 64, + "mlp_ratio": 2.6667, + "patch_size": 14, + "eva_model_name": "eva-clip-l-14-336", + "xattn": True, + "fusedLN": True, + "rope": True, + "pt_hw_seq_len": 16, + "intp_freq": True, + "naiveswiglu": True, + "subln": True + } + } + + self.config['vision_tower_path'] = vision_tower_path + self.model = _build_vision_tower(**self.config) + + def forward(self, image, **kwargs): + encode = self.model(image, return_all_features=True)[:, 1:, :] + return encode + + @property + def dtype(self): + return list(self.parameters())[-1].dtype + + @property + def device(self): + return list(self.parameters())[-1].device diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/llm2clip/llm2clip_encoder b/Unicorn_dense/bunny/model/multimodal_encoder/llm2clip/llm2clip_encoder new file mode 100644 index 0000000000000000000000000000000000000000..f93d071318637d3ba738cd6e75d39fceb61f57ee --- /dev/null +++ b/Unicorn_dense/bunny/model/multimodal_encoder/llm2clip/llm2clip_encoder @@ -0,0 +1,62 @@ +import torch +import torch.nn as nn + +from PIL import Image +from transformers import AutoModel, AutoConfig, AutoTokenizer +from transformers import CLIPImageProcessor +import torch +from llm2vec import LLM2Vec + + +class LLM2CLIPTextTower(nn.Module): + def __init__(self): + super().__init__() + + self.image_processor = CLIPImageProcessor.from_pretrained("/data/xmyu/Bunny_all/checkpoints/clip-vit-large-patch14-336") + self.vision_tower = AutoModel.from_pretrained( + "/data/xmyu/Bunny_all/checkpoints/LLM2CLIP-Openai-L-14-336", + torch_dtype=torch.bfloat16, + trust_remote_code=True).to('cuda').eval() + + self.vision_tower.requires_grad_(False) + + + self.llm_model_name = '/data/xmyu/Bunny_all/checkpoints/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned' + config = AutoConfig.from_pretrained( + self.llm_model_name, trust_remote_code=True + ) + self.llm_model = AutoModel.from_pretrained(self.llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True) + self.llm2clip_tokenizer = AutoTokenizer.from_pretrained(self.llm_model_name) + self.llm_model.config._name_or_path = '/data/xmyu/Bunny_all/checkpoints/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC + self.l2v = LLM2Vec(self.llm_model, self.llm2clip_tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512) + + self.is_loaded = True + + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_feature = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), + output_hidden_states=True) + image_features.append(image_feature) + else: + image_features = self.vision_tower(images.to(device=self.device, dtype=self.dtype), + output_hidden_states=True) + + return image_features + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + @property + def dtype(self): + return self.vision_tower.dtype + + @property + def device(self): + return self.vision_tower.device + + @property + def hidden_size(self): + return 1280 \ No newline at end of file diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/siglip/__pycache__/siglip_encoder.cpython-310.pyc b/Unicorn_dense/bunny/model/multimodal_encoder/siglip/__pycache__/siglip_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c34427aaa8a9567ba09d42ac0b4f6d9ecad88e9 Binary files /dev/null and b/Unicorn_dense/bunny/model/multimodal_encoder/siglip/__pycache__/siglip_encoder.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/multimodal_encoder/siglip/siglip_encoder.py b/Unicorn_dense/bunny/model/multimodal_encoder/siglip/siglip_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..4f297e25d92412bfc16da6108751c3cc147f0ebc --- /dev/null +++ b/Unicorn_dense/bunny/model/multimodal_encoder/siglip/siglip_encoder.py @@ -0,0 +1,130 @@ +import torch +import torch.nn as nn + +from transformers import SiglipVisionModel, SiglipImageProcessor, SiglipVisionConfig +from bunny.util.s2wrapper import forward as multiscale_forward + + +class SiglipVisionTower(nn.Module): + def __init__(self, vision_tower, args, delay_load=False): + super().__init__() + + self.is_loaded = False + + self.vision_tower_name = vision_tower + self.select_layer = -2 + + if not delay_load: + self.load_model() + else: + self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name) + + def load_model(self): + if self.is_loaded: + return + self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name) + self.image_processor.crop_size = self.image_processor.size + self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name) + self.vision_tower.requires_grad_(False) + + self.is_loaded = True + + def feature_select(self, image_forward_outs): + image_features = image_forward_outs.hidden_states[self.select_layer] + + return image_features + + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), + output_hidden_states=True) + image_feature = self.feature_select(image_forward_out).to(image.dtype) + image_features.append(image_feature) + else: + image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), + output_hidden_states=True) + image_features = self.feature_select(image_forward_outs).to(images.dtype) + + return image_features + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + @property + def dtype(self): + return self.vision_tower.dtype + + @property + def device(self): + return self.vision_tower.device + + @property + def config(self): + if self.is_loaded: + return self.vision_tower.config + else: + return self.cfg_only + + @property + def hidden_size(self): + return self.config.hidden_size + + @property + def num_patches(self): + return (self.config.image_size // self.config.patch_size) ** 2 + + +class SiglipVisionTowerS2(SiglipVisionTower): + def __init__(self, vision_tower, args, delay_load=False): + self.s2_scales = getattr(args, 's2_scales', '384,768,1152') + self.s2_scales = list(map(int, self.s2_scales.split(','))) + self.s2_scales.sort() + self.s2_split_size = self.s2_scales[0] + self.s2_image_size = self.s2_scales[-1] + + super().__init__(vision_tower, args, delay_load) + + self.multiscale_forward = multiscale_forward + + if not delay_load: + self.image_processor.size['height'] = self.image_processor.size['width'] = self.s2_image_size + self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size + + def load_model(self): + if self.is_loaded: + return + self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name) + self.image_processor.crop_size = self.image_processor.size + self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name) + self.vision_tower.requires_grad_(False) + + self.image_processor.size['height'] = self.image_processor.size['width'] = self.s2_image_size + self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size + + self.is_loaded = True + + def forward_feature(self, images): + image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), + output_hidden_states=True) + image_features = self.feature_select(image_forward_outs).to(images.dtype) + return image_features + + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_feature = self.multiscale_forward(self.forward_feature, image.unsqueeze(0), + img_sizes=self.s2_scales, max_split_size=self.s2_split_size) + image_features.append(image_feature) + else: + image_features = self.multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, + max_split_size=self.s2_split_size) + + return image_features + + @property + def hidden_size(self): + return self.config.hidden_size * len(self.s2_scales) diff --git a/Unicorn_dense/bunny/model/multimodal_projector/__pycache__/builder.cpython-310.pyc b/Unicorn_dense/bunny/model/multimodal_projector/__pycache__/builder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32d0c8278f44a1042daf9b39c1645ed5e1482ee7 Binary files /dev/null and b/Unicorn_dense/bunny/model/multimodal_projector/__pycache__/builder.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/model/multimodal_projector/builder.py b/Unicorn_dense/bunny/model/multimodal_projector/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..51eb0955cff95e1ef9405497dee0063a4a17c52d --- /dev/null +++ b/Unicorn_dense/bunny/model/multimodal_projector/builder.py @@ -0,0 +1,194 @@ +import re +import math +from torch import nn +from functools import partial +from timm.layers.norm_act import LayerNormAct2d +from torchvision.ops.misc import SqueezeExcitation as SELayer +from torchvision.models.mobilenetv3 import InvertedResidual, InvertedResidualConfig + + +class IdentityMap(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, *args, **kwargs): + print('') + return x + + @property + def config(self): + return {"mm_projector_type": 'identity'} + + +class Minigpt(nn.Module): + def __init__(self, config=None): + super(Minigpt, self).__init__() + # c*4 is the input size, and c is the output size for the linear layer + inc, ouc = config.mm_hidden_size, config.hidden_size + self.linear = nn.Linear(inc * 4, ouc) + + def forward(self, x): + print('') + # x is the input tensor with shape [b, num_tokens, c] + b, num_tokens, c = x.shape + + # Check if num_tokens is divisible by 4 + if num_tokens % 4 != 0: + raise ValueError("num_tokens must be divisible by 4") + + # Reshape x to [b, num_tokens/4, c*4] + x = x.view(b, num_tokens // 4, c * 4) + + # Apply the linear transformation + x = self.linear(x) + return x + + +class Vanilla(nn.Module): + def __init__(self, config=None): + super(Vanilla, self).__init__() + # c*4 is the input size, and c is the output size for the linear layer + inc, ouc = config.mm_hidden_size, config.hidden_size + self.linear = nn.Linear(inc * 4, ouc) + + def forward(self, x): + print('') + b, num_tokens, c = x.shape + + # Check if num_tokens is divisible by 4 + if num_tokens % 4 != 0: + raise ValueError("num_tokens must be divisible by 4") + + # First, reshape to [b, num_tokens//4, 4, c] + x = x.view(b, num_tokens // 4, 4, c) + + # Then, permute to interleave the tokens + x = x.permute(0, 1, 3, 2).contiguous() + + # Finally, reshape to [b, num_tokens//4, c*4] to interleave features of 4 tokens + x = x.view(b, num_tokens // 4, c * 4) + + # Apply the linear transformation + x = self.linear(x) + return x + + +class LDPBlock(nn.Module): + # Lightweight Downsample Projector Block + + def __init__(self, config=None): + super().__init__() + + inc, ouc = config.mm_hidden_size, config.hidden_size + layer_norm = partial(LayerNormAct2d, act_layer=None) + se_layer = partial(SELayer, scale_activation=nn.Hardsigmoid) + self.mlp = nn.Sequential( + nn.Identity(), nn.Linear(inc, ouc), nn.GELU(), nn.Linear(ouc, ouc) + ) + self.mb_block = nn.Sequential( + nn.Identity(), + InvertedResidual(InvertedResidualConfig(ouc, 3, ouc, ouc, True, "HS", 1, 1, 1), layer_norm, se_layer), + InvertedResidual(InvertedResidualConfig(ouc, 3, ouc, ouc, True, "HS", 2, 1, 1), layer_norm, se_layer) + ) + + def forward(self, x): + + print('') + b, num_tokens, c = x.shape + h = int(math.sqrt(num_tokens)) + x = self.mlp(x) + x = x.permute(0, 2, 1).reshape(b, -1, h, h) + x = self.mb_block(x) + x = x.flatten(2).permute(0, 2, 1) + return x + + +class LDPNetProjector(nn.Module): + + def __init__(self, config=None): + super().__init__() + self.model = LDPBlock(config) + + def forward(self, x): + print('') + return self.model(x) + + +class SPP(nn.Module): + + def __init__(self, config=None, projector_type='v1'): + super().__init__() + + self.projector_type = projector_type + + inc, ouc = config.mm_hidden_size, config.hidden_size + self.linear_0 = nn.Linear(inc, inc) + + self.linear_1 = nn.Linear(inc, ouc) + + self.pooling = nn.AvgPool2d(kernel_size=2) + + self.linear_2 = nn.Linear(ouc, ouc) + + def forward(self, x): + print('') + b, num_tokens, c = x.shape + h = int(math.sqrt(num_tokens)) + if 'v1' in self.projector_type: + x = self.linear_1(x) + x = x.permute(0, 2, 1).reshape(b, -1, h, h) + x = self.pooling(x) + x = x.flatten(2).permute(0, 2, 1) + x = self.linear_2(x) + elif 'v2' in self.projector_type: + x = self.linear_1(x) + x = self.linear_2(x) + x = x.permute(0, 2, 1).reshape(b, -1, h, h) + x = self.pooling(x) + x = x.flatten(2).permute(0, 2, 1) + elif 'v3' in self.projector_type: + x = self.linear_0(x) + x = x.permute(0, 2, 1).reshape(b, -1, h, h) + x = self.pooling(x) + x = x.flatten(2).permute(0, 2, 1) + x = self.linear_1(x) + x = self.linear_2(x) + return x + + +def build_vision_projector(config, delay_load=False, **kwargs): + + # print('') + + projector_type = getattr(config, 'mm_projector_type', 'mlp2x_gelu') + + if projector_type == 'linear': + return nn.Linear(config.mm_hidden_size, config.hidden_size) + + elif projector_type.startswith('mlp'): + # print('') + mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) + if mlp_gelu_match: + mlp_depth = int(mlp_gelu_match.group(1)) + modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] + for _ in range(1, mlp_depth): + modules.append(nn.GELU()) + modules.append(nn.Linear(config.hidden_size, config.hidden_size)) + return nn.Sequential(*modules) + + elif projector_type.startswith('spp'): + return SPP(config, projector_type) + + elif projector_type == 'ldp': + return LDPNetProjector(config) + + elif projector_type == 'vanilla': + return Vanilla(config) + + elif projector_type == 'minigpt': + return Minigpt(config) + + elif projector_type == 'identity': + return IdentityMap() + + raise ValueError(f'Unknown projector type: {projector_type}') diff --git a/Unicorn_dense/bunny/train/.DS_Store b/Unicorn_dense/bunny/train/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/Unicorn_dense/bunny/train/.DS_Store differ diff --git a/Unicorn_dense/bunny/train/__pycache__/bunny_trainer.cpython-310.pyc b/Unicorn_dense/bunny/train/__pycache__/bunny_trainer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4c12fbdb4abf687a44efc67db4bd8612c272031 Binary files /dev/null and b/Unicorn_dense/bunny/train/__pycache__/bunny_trainer.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/train/bunny_trainer.py b/Unicorn_dense/bunny/train/bunny_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..840e9421c674b011bed81589c86f27665cb70e70 --- /dev/null +++ b/Unicorn_dense/bunny/train/bunny_trainer.py @@ -0,0 +1,257 @@ +import os +import torch + +from torch.utils.data import Sampler +from torch import nn +from transformers import Trainer +from transformers.trainer import is_sagemaker_mp_enabled, get_parameter_names, has_length, ALL_LAYERNORM_LAYERS, logger + +from typing import List, Optional + + +def maybe_zero_3(param, ignore_status=False, name=None): + from deepspeed import zero + from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + if hasattr(param, "ds_id"): + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if not ignore_status: + print(name, 'no ignore status') + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = param.detach().cpu().clone() + return param + + +def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): + to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} + to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()} + return to_return + + +def split_to_even_chunks(indices, lengths, num_chunks): + """ + Split a list of indices into `chunks` chunks of roughly equal lengths. + """ + + if len(indices) % num_chunks != 0: + return [indices[i::num_chunks] for i in range(num_chunks)] + + num_indices_per_chunk = len(indices) // num_chunks + + chunks = [[] for _ in range(num_chunks)] + chunks_lengths = [0 for _ in range(num_chunks)] + for index in indices: + shortest_chunk = chunks_lengths.index(min(chunks_lengths)) + chunks[shortest_chunk].append(index) + chunks_lengths[shortest_chunk] += lengths[index] + if len(chunks[shortest_chunk]) == num_indices_per_chunk: + chunks_lengths[shortest_chunk] = float("inf") + + return chunks + + +def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None): + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. + assert all(l != 0 for l in lengths), "Should not have zero length." + if all(l > 0 for l in lengths) or all(l < 0 for l in lengths): + # all samples are in the same modality + return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator) + mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0]) + lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0]) + + mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)] + lang_shuffle = [lang_indices[i] for i in + get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)] + megabatch_size = world_size * batch_size + mm_megabatches = [mm_shuffle[i: i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)] + lang_megabatches = [lang_shuffle[i: i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)] + + last_mm = mm_megabatches[-1] + last_lang = lang_megabatches[-1] + additional_batch = last_mm + last_lang + megabatches = mm_megabatches[:-1] + lang_megabatches[:-1] + megabatch_indices = torch.randperm(len(megabatches), generator=generator) + megabatches = [megabatches[i] for i in megabatch_indices] + + if len(additional_batch) > 0: + megabatches.append(sorted(additional_batch)) + + return [i for megabatch in megabatches for i in megabatch] + + +def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True): + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. + indices = torch.randperm(len(lengths), generator=generator) + megabatch_size = world_size * batch_size + megabatches = [indices[i: i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)] + megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches] + megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches] + + return [i for megabatch in megabatches for batch in megabatch for i in batch] + + +class LengthGroupedSampler(Sampler): + r""" + Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while + keeping a bit of randomness. + """ + + def __init__( + self, + batch_size: int, + world_size: int, + lengths: Optional[List[int]] = None, + generator=None, + group_by_modality: bool = False, + ): + if lengths is None: + raise ValueError("Lengths must be provided.") + + self.batch_size = batch_size + self.world_size = world_size + self.lengths = lengths + self.generator = generator + self.group_by_modality = group_by_modality + + def __len__(self): + return len(self.lengths) + + def __iter__(self): + if self.group_by_modality: + indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, + generator=self.generator) + else: + indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, + generator=self.generator) + return iter(indices) + + +class BunnyTrainer(Trainer): + + def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: + if self.train_dataset is None or not has_length(self.train_dataset): + return None + + if self.args.group_by_modality_length: + lengths = self.train_dataset.modality_lengths + return LengthGroupedSampler( + self.args.train_batch_size, + world_size=self.args.world_size * self.args.gradient_accumulation_steps, + lengths=lengths, + group_by_modality=True, + ) + else: + return super()._get_train_sampler() + + def create_optimizer(self): + """ + Setup the optimizer. + + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method in a subclass. + """ + if is_sagemaker_mp_enabled(): + return super().create_optimizer() + + opt_model = self.model + + if self.optimizer is None: + decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) + decay_parameters = [name for name in decay_parameters if "bias" not in name] + if self.args.mm_projector_lr is not None: + projector_parameters = [name for name, _ in opt_model.named_parameters() if "mm_projector" in name or "vision_tower" in name] + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in opt_model.named_parameters() if + (n in decay_parameters and n not in projector_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if + (n not in decay_parameters and n not in projector_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if + (n in decay_parameters and n in projector_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + "lr": self.args.mm_projector_lr, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if + (n not in decay_parameters and n in projector_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + "lr": self.args.mm_projector_lr, + }, + ] + else: + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if + (n not in decay_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + }, + ] + + optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) + + self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + if optimizer_cls.__name__ == "Adam8bit": + import bitsandbytes + + manager = bitsandbytes.optim.GlobalOptimManager.get_instance() + + skipped = 0 + for module in opt_model.modules(): + if isinstance(module, nn.Embedding): + skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values()) + logger.info(f"skipped {module}: {skipped / 2 ** 20}M params") + manager.register_module_override(module, "weight", {"optim_bits": 32}) + logger.debug(f"bitsandbytes: will optimize {module} in fp32") + logger.info(f"skipped: {skipped / 2 ** 20}M params") + + return self.optimizer + + def _save_checkpoint(self, model, trial, metrics=None): + if getattr(self.args, 'tune_mm_mlp_adapter', False): + from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + + run_dir = self._get_output_dir(trial=trial) + output_dir = os.path.join(run_dir, checkpoint_folder) + + # Only save Adapter + keys_to_match = ['mm_projector', 'vision_resampler'] + if getattr(self.args, "use_im_start_end", False): + keys_to_match.extend(['embed_tokens', 'embed_in']) + + weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match) + + if self.args.local_rank == 0 or self.args.local_rank == -1: + self.model.config.save_pretrained(output_dir) + torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin')) + else: + super(BunnyTrainer, self)._save_checkpoint(model, trial, metrics) + + def _save(self, output_dir: Optional[str] = None, state_dict=None): + if getattr(self.args, 'tune_mm_mlp_adapter', False): + pass + else: + super(BunnyTrainer, self)._save(output_dir, state_dict) diff --git a/Unicorn_dense/bunny/train/train.py b/Unicorn_dense/bunny/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..fa78c90ca9026af5d114e8ca2ac5cc0b55e17675 --- /dev/null +++ b/Unicorn_dense/bunny/train/train.py @@ -0,0 +1,231 @@ +import os +import logging +import pathlib +from dataclasses import dataclass, field +from typing import Dict, Optional, Sequence + +import torch +import transformers +from torch.utils.data import Dataset + +from bunny.train.bunny_trainer import BunnyTrainer +from bunny import conversation as conversation_lib +# 只保留通用的和 Llama 相关的模型类 +from bunny.model import BunnyLlamaForCausalLM +# 确保使用支持 embedding_path 的修改版 data_utils +from bunny.util.data_utils import make_supervised_data_module, DataArguments + +local_rank = None + +def rank0_print(*args): + if local_rank == 0: + print(*args) + +@dataclass +class ModelArguments: + model_name_or_path: Optional[str] = field(default=None) + model_type: Optional[str] = field(default=None) + version: Optional[str] = field(default=None) + freeze_backbone: bool = field(default=False) + tune_mm_mlp_adapter: bool = field(default=False) + + # 保留定义但设为 None,防止底层 builder 报错 + vision_tower: Optional[str] = field(default=None) + unfreeze_vision_tower: bool = field(default=False) + + use_s2: bool = field(default=False) + pretrain_mm_mlp_adapter: Optional[str] = field(default=None) + mm_projector_type: Optional[str] = field(default='mlp2x_gelu') + + +@dataclass +class TrainingArguments(transformers.TrainingArguments): + cache_dir: Optional[str] = field(default=None) + optim: str = field(default="adamw_torch") + remove_unused_columns: bool = field(default=False) + freeze_mm_mlp_adapter: bool = field(default=False) + mpt_attn_impl: Optional[str] = field(default="triton") + model_max_length: int = field( + default=512, + metadata={ + "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." + }, + ) + # 删除所有 bits, quantization, lora 参数 + mm_projector_lr: Optional[float] = None + group_by_modality_length: bool = field(default=False) + + +def maybe_zero_3(param, ignore_status=False, name=None): + from deepspeed import zero + from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + if hasattr(param, "ds_id"): + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if not ignore_status: + logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}") + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = param.detach().cpu().clone() + return param + + +def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): + to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} + to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()} + return to_return + + +def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str): + """Collects the state dict and dump to disk.""" + # Pretrain 阶段:只保存 Projector (Adapter) + if getattr(trainer.args, "tune_mm_mlp_adapter", False): + keys_to_match = ['mm_projector'] + if getattr(trainer.args, "use_im_start_end", False): + keys_to_match.extend(['embed_tokens', 'embed_in']) + + weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match) + trainer.model.config.save_pretrained(output_dir) + + current_folder = output_dir.split('/')[-1] + parent_folder = os.path.dirname(output_dir) + if trainer.args.local_rank == 0 or trainer.args.local_rank == -1: + if current_folder.startswith('checkpoint-'): + mm_projector_folder = os.path.join(parent_folder, "mm_projector") + os.makedirs(mm_projector_folder, exist_ok=True) + torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin')) + else: + torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin')) + return + + # SFT (Full Finetune) 阶段:保存完整模型 + if trainer.deepspeed: + torch.cuda.synchronize() + trainer.save_model(output_dir) + return + + state_dict = trainer.model.state_dict() + if trainer.args.should_save: + cpu_state_dict = { + key: value.cpu() + for key, value in state_dict.items() + } + del state_dict + trainer._save(output_dir, state_dict=cpu_state_dict) # noqa + + +def train(): + global local_rank + + parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + local_rank = training_args.local_rank + + # 1. Tokenizer Setup + # 针对 Llama3-8B 的设置 + if model_args.model_type == 'llama3-8b': + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right", + use_fast=True, + ) + tokenizer.eos_token_id = 128001 + tokenizer.pad_token = tokenizer.eos_token + else: + # 通用 Fallback + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right", + use_fast=True, + ) + if tokenizer.unk_token is not None and tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.unk_token + + # 2. Model Loading (Only Full Precision / BF16) + if model_args.model_type == 'llama3-8b': + model = BunnyLlamaForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + torch_dtype=(torch.bfloat16 if training_args.bf16 else torch.float32) + ) + else: + raise ValueError(f"This script is cleaned for Llama3. Unknown Model Type {model_args.model_type}") + + model.config.use_cache = False + + if model_args.freeze_backbone: + model.model.requires_grad_(False) + + # Gradient Checkpointing + if training_args.gradient_checkpointing: + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # Chat Template Setup + if model_args.version in conversation_lib.conv_templates: + conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version] + else: + conversation_lib.default_conversation = conversation_lib.conv_templates["default"] + + # 3. Initialize Vision Modules (For Unicorn this sets up the Projector) + rank0_print('Initializing Vision Modules (Projector)...') + model.get_model().initialize_vision_modules(model_args=model_args) + + # Unicorn Configs + model.config.image_aspect_ratio = data_args.image_aspect_ratio + model.config.tokenizer_padding_side = tokenizer.padding_side + model.config.tokenizer_model_max_length = tokenizer.model_max_length + + # 4. Freeze / Unfreeze Logic + # Case A: Pretrain (Tune Adapter Only) + model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter + if model_args.tune_mm_mlp_adapter: + model.requires_grad_(False) + for p in model.get_model().mm_projector.parameters(): + p.requires_grad = True + + # Case B: Explicit Freeze + model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter + if training_args.freeze_mm_mlp_adapter: + for p in model.get_model().mm_projector.parameters(): + p.requires_grad = False + + model.config.mm_projector_lr = training_args.mm_projector_lr + model.config.use_s2 = model_args.use_s2 + + # 5. Data & Training + data_module = make_supervised_data_module(tokenizer=tokenizer, + data_args=data_args) + + trainer = BunnyTrainer(model=model, + tokenizer=tokenizer, + args=training_args, + **data_module) + + if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): + trainer.train(resume_from_checkpoint=True) + else: + trainer.train() + + trainer.save_state() + + model.config.use_cache = True + + # 6. Save Model + # 只要不是 LoRA,都走这个通用保存逻辑 + safe_save_model_for_hf_trainer(trainer=trainer, + output_dir=training_args.output_dir) + + +if __name__ == "__main__": + train() \ No newline at end of file diff --git a/Unicorn_dense/bunny/util/.DS_Store b/Unicorn_dense/bunny/util/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d0ba6a9d0c171e434647d46a80967bfe9a876335 Binary files /dev/null and b/Unicorn_dense/bunny/util/.DS_Store differ diff --git a/Unicorn_dense/bunny/util/__pycache__/data_utils.cpython-310.pyc b/Unicorn_dense/bunny/util/__pycache__/data_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..469e4b6156eafbcd3ea404e2d8b58dca0392a332 Binary files /dev/null and b/Unicorn_dense/bunny/util/__pycache__/data_utils.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/util/__pycache__/mm_utils.cpython-310.pyc b/Unicorn_dense/bunny/util/__pycache__/mm_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..288060850339c859984d8eb6d45654ec9121bb62 Binary files /dev/null and b/Unicorn_dense/bunny/util/__pycache__/mm_utils.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/util/data_utils.py b/Unicorn_dense/bunny/util/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3d25911009109ffce6a89c4c4ffe09a4a8421283 --- /dev/null +++ b/Unicorn_dense/bunny/util/data_utils.py @@ -0,0 +1,523 @@ +import os +import copy +import pickle +from dataclasses import dataclass, field +import json +from typing import Dict, Sequence, Optional +import torch +import transformers +from bunny.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN +from torch.utils.data import Dataset +from bunny import conversation as conversation_lib +from bunny.util.mm_utils import tokenizer_image_token + +import random + + +@dataclass +class DataArguments: + data_path: str = field(default=None, metadata={"help": "Path to the training data."}) + lazy_preprocess: bool = False + is_multimodal: bool = True + image_aspect_ratio: str = field(default=None) + + +def preprocess_multimodal( + sources: Sequence[str], + data_args: DataArguments +) -> Dict: + is_multimodal = data_args.is_multimodal + if not is_multimodal: + return sources + + for source in sources: + for sentence in source: + if DEFAULT_IMAGE_TOKEN in sentence['value']: + sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip() + sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value'] + sentence['value'] = sentence['value'].strip() + + replace_token = DEFAULT_IMAGE_TOKEN + + sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token) + + return sources + + +def preprocess_bunny( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack( + [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + + assert conv.sep_style == conversation_lib.SeparatorStyle.TWO + + # Mask targets + sep = conv.sep + conv.roles[1] + ": " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep2) + cur_len = 0 + end_token_cnt = 0 + + for i, rou in enumerate(rounds): + # print('<----------------------------------------------->') + # print('for111') + # print('<----------------------------------------------->') + if rou == "": + # print('<----------------------------------------------->') + # print('for222') + # print('<----------------------------------------------->') + break + + # print('<----------------------------------------------->') + # print('for444') + # print('<----------------------------------------------->') + + parts = rou.split(sep) + if len(parts) != 2: + # print('<----------------------------------------------->') + # print('for333') + # print('<----------------------------------------------->') + break + parts[0] += sep + + # print('<----------------------------------------------->') + # print('for555') + # print('<----------------------------------------------->') + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 1 + + round_len += 1 + end_token_cnt += 1 + + target[cur_len: cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if tokenizer.pad_token_id == tokenizer.eos_token_id: + cur_len -= end_token_cnt + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." + f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_bunny_with_bos( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack( + [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + + assert conv.sep_style == conversation_lib.SeparatorStyle.TWO + + # Mask targets + sep = conv.sep + conv.roles[1] + ": " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep2) + cur_len = 1 + end_token_cnt = 0 + target[:cur_len] = IGNORE_INDEX + + for i, rou in enumerate(rounds): + + # print('<----------------------------------------------->') + # print('111') + # print('<----------------------------------------------->') + + if rou == "": + # print('<----------------------------------------------->') + # print('222') + # print('<----------------------------------------------->') + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 2 + + target[cur_len: cur_len + instruction_len] = IGNORE_INDEX + + end_token_cnt += 1 + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if tokenizer.pad_token_id == tokenizer.eos_token_id: + cur_len -= end_token_cnt + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." + f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_plain( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, +) -> Dict: + # add end signal and concatenate together + conversations = [] + for source in sources: + assert len(source) == 2 + assert DEFAULT_IMAGE_TOKEN in source[0]['value'] + source[0]['value'] = DEFAULT_IMAGE_TOKEN + conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep + conversations.append(conversation) + # tokenize conversations + input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] + targets = copy.deepcopy(input_ids) + for target, source in zip(targets, sources): + tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer)) + target[:tokenized_len] = IGNORE_INDEX + + return dict(input_ids=input_ids, labels=targets) + + +def preprocess( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN: + # print('00000000000') + return preprocess_plain(sources, tokenizer) + + if conversation_lib.default_conversation.version == "bunny": + # print('11111111111') + return preprocess_bunny(sources, tokenizer, has_image=has_image) + elif conversation_lib.default_conversation.version in {"minicpm", "llama"}: + # print('22222222222') + return preprocess_bunny_with_bos(sources, tokenizer, has_image=has_image) + # temporarily fix + # Phi-3 June 2024 Update changes bos_token behavior + elif conversation_lib.default_conversation.version == "phi3": + if len(tokenizer('').input_ids) == 0: + # print('33333333333') + return preprocess_bunny(sources, tokenizer, has_image=has_image) + else: + # print('44444444444') + return preprocess_bunny_with_bos(sources, tokenizer, has_image=has_image) + + +class LazySupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__(self, data_path: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments): + super(LazySupervisedDataset, self).__init__() + list_data_dict = json.load(open(data_path, "r")) + + print("Formatting inputs...Skip in lazy mode") + self.tokenizer = tokenizer + self.data_args = data_args + + # 【修改点2】:预先过滤掉没有 caption 的数据,防止 __getitem__ 返回 None 导致崩溃 + self.list_data_dict = [x for x in list_data_dict if 'caption' in x] + print(f"Loaded {len(list_data_dict)} raw samples, filtered to {len(self.list_data_dict)} valid samples (with captions).") + + # <-----------------------------------------------------> + # bunny + # <-----------------------------------------------------> + + # ours + # folder_path = '/llm-align/liuchonghan/embed_0113/exp_embedding/embedding_bunny/text_ours/ours' + + # trace + # folder_path = '/llm-align/liuchonghan/embed_0113/exp_embedding/embedding_bunny/text_trace/trace' + + # <-----------------------------------------------------> + # densefusion + # <-----------------------------------------------------> + + # ours + # folder_path = '/llm-align/liuchonghan/embed_0113/exp_embedding/embedding_densefusion/text_ours/ours' + + # trace + folder_path = '/llm-align/liuchonghan/embed_0113/exp_embedding/embedding_densefusion/text_trace/trace' + + print('<----------------------------------------------------->') + print(folder_path) + print('<----------------------------------------------------->') + + # 【修改点1】:使用 Dictionary (Hash Map) 代替 List,实现 O(1) 查找 + self.embeddings = {} + + if os.path.exists(folder_path): + print(f"Loading embeddings from {folder_path}...") + for filename in os.listdir(folder_path): + if filename.endswith('.pkl'): + file_path = os.path.join(folder_path, filename) + try: + with open(file_path, 'rb') as f: + batch_data = pickle.load(f) + # 建立 id -> embed 的映射 + for item in batch_data: + self.embeddings[str(item.get('id'))] = item.get('embed') + except Exception as e: + print(f"Error loading {filename}: {e}") + print(f"Loaded {len(self.embeddings)} embeddings.") + else: + print(f"ERROR: Folder path {folder_path} does not exist.") + + + def __len__(self): + return len(self.list_data_dict) + + @property + def lengths(self): + length_list = [] + for sample in self.list_data_dict: + img_tokens = 128 if 'image' in sample else 0 + length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens) + return length_list + + @property + def modality_lengths(self): + length_list = [] + for sample in self.list_data_dict: + cur_len = sum(len(conv['value'].split()) for conv in sample['conversations']) + cur_len = cur_len if 'image' in sample else -cur_len + length_list.append(cur_len) + return length_list + + def __getitem__(self, i): + + # sources + # { + # "id": "0010278167", + # "caption": ***, + # "conversations": [ + # { + # "from": "human", + # "value": "\nWhat is this?" + # }, + # { + # "from": "gpt", + # "value": "***" + # } + # ] + # } + + # caption 有两个作用: + # 1. 替代 image + # 2. 作为 pretrain 阶段 imaginary image 的 caption 训练数据 + sources = self.list_data_dict[i] + + # print('<------------------------------------->') + # print('sources1: \n') + # print(sources) + # print('<------------------------------------->') + + # 【修改点2】:由于在 __init__ 已经过滤,这里一定有 caption,无需 else return None + # 作用 2:作为 pretrain 阶段 imaginary image 的 caption 训练数据 TODO + sources_processed = preprocess_multimodal( + copy.deepcopy([e["conversations"] for e in [sources]]), self.data_args) + + # print('<------------------------------------->') + # print('sources2: \n') + # print(sources) + # print('<------------------------------------->') + + # 由于数据格式一致,所以无需修改 preprocess + data_dict = preprocess( + sources_processed, # 注意这里使用处理后的 sources_processed + self.tokenizer, + has_image=True # 既然经过过滤,则认为包含图像信息 + ) + + if isinstance(i, int): + data_dict = dict(input_ids=data_dict["input_ids"][0], + labels=data_dict["labels"][0]) + + + # print('<------------------------------------->') + # print('self.list_data_dict[i]: \n') + # print(self.list_data_dict[i]) + # print('<------------------------------------->') + + # 【修改点1】:使用字典 O(1) 查找 embedding + sid = str(sources.get('id')) + if sid in self.embeddings: + # print('IN THERE!') + data_dict['embed'] = self.embeddings[sid] + + # print('<------------------------------------->') + # print('data_dict2: \n') + # print(data_dict) + # print('<------------------------------------->') + + # dict: + # { + # input_ids, + # labels, + # embed + # } + + return data_dict + + +@dataclass +class DataCollatorForSupervisedDataset(object): + """Collate examples for supervised fine-tuning.""" + + tokenizer: transformers.PreTrainedTokenizer + + def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + input_ids, labels = tuple([instance[key] for instance in instances] + for key in ("input_ids", "labels")) + + if self.tokenizer.pad_token_id == self.tokenizer.eos_token_id: + for input_id in input_ids: + input_id[input_id == self.tokenizer.eos_token_id] = -300 + + input_ids = torch.nn.utils.rnn.pad_sequence( + input_ids, + batch_first=True, + padding_value=self.tokenizer.pad_token_id) + + labels = torch.nn.utils.rnn.pad_sequence( + labels, + batch_first=True, + padding_value=IGNORE_INDEX) + + input_ids = input_ids[:, :self.tokenizer.model_max_length] + + attention_mask = input_ids.ne(self.tokenizer.pad_token_id) + + labels = labels[:, :self.tokenizer.model_max_length] + + if self.tokenizer.pad_token_id == self.tokenizer.eos_token_id: + for input_id in input_ids: + input_id[input_id == -300] = self.tokenizer.eos_token_id + + batch = dict( + input_ids=input_ids, + labels=labels, + attention_mask=attention_mask, + ) + + # print('') + # print(instances) + # print('<---------------------------------------------------->') + + if 'embed' in instances[0]: + + embeds = [instance['embed'] for instance in instances] + embeds_tensor = torch.stack([torch.from_numpy(embed) for embed in embeds]) + + # print('') + # print(embeds_tensor) + # print('') + + # 可选:将数据类型转换为浮点数(如果尚未) + # embeds_tensor = embeds_tensor.float() + batch['embeds'] = embeds_tensor + + return batch + +def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, + data_args) -> Dict: + """Make dataset and collator for supervised fine-tuning.""" + train_dataset = LazySupervisedDataset(tokenizer=tokenizer, + data_path=data_args.data_path, + data_args=data_args) + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + return dict(train_dataset=train_dataset, + eval_dataset=None, + data_collator=data_collator) diff --git a/Unicorn_dense/bunny/util/mm_utils.py b/Unicorn_dense/bunny/util/mm_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5c3d83fc40ea95caa39a66f8f6580468dc8963e1 --- /dev/null +++ b/Unicorn_dense/bunny/util/mm_utils.py @@ -0,0 +1,120 @@ +import base64 +import torch + +from PIL import Image +from io import BytesIO +from transformers import StoppingCriteria + +from bunny.constants import IMAGE_TOKEN_INDEX + + +def load_image_from_base64(image): + return Image.open(BytesIO(base64.b64decode(image))) + + +def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + +def process_images(images, image_processor, model_cfg): + image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) + new_images = [] + if image_aspect_ratio == 'pad': + for image in images: + image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean)) + image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + new_images.append(image) + else: + return image_processor(images, return_tensors='pt')['pixel_values'] + if all(x.shape == new_images[0].shape for x in new_images): + new_images = torch.stack(new_images, dim=0) + return new_images + + +def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): + prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] + + def insert_separator(X, sep): + return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] + + input_ids = [] + + # 处理第一个块,保留 BOS(如果有) + if len(prompt_chunks) > 0: + input_ids.extend(prompt_chunks[0]) + + # 准备分隔符(图片 Token) + # 注意:这里不需要像原代码那样根据 offset 动态调整分隔符长度,Unicorn 只需要一个 token + image_token_seq = [image_token_index] + + # 处理后续块 + # 我们跳过第一个块(因为已经加了),然后插入 图片+后续块 + for i in range(1, len(prompt_chunks)): + input_ids.extend(image_token_seq) # 插入 token + + chunk = prompt_chunks[i] + # 关键修复:检查当前这个块是否以 BOS 开头,如果是才去掉 + if len(chunk) > 0 and chunk[0] == tokenizer.bos_token_id: + input_ids.extend(chunk[1:]) + else: + input_ids.extend(chunk) + + if return_tensors is not None: + if return_tensors == 'pt': + return torch.tensor(input_ids, dtype=torch.long) + raise ValueError(f'Unsupported tensor type: {return_tensors}') + return input_ids + + +def get_model_name_from_path(model_path): + model_path = model_path.strip("/") + model_paths = model_path.split("/") + if model_paths[-1].startswith('checkpoint-'): + return model_paths[-2] + "_" + model_paths[-1] + else: + return model_paths[-1] + + +class KeywordsStoppingCriteria(StoppingCriteria): + def __init__(self, keywords, tokenizer, input_ids): + self.keywords = keywords + self.keyword_ids = [] + self.max_keyword_len = 0 + for keyword in keywords: + cur_keyword_ids = tokenizer(keyword).input_ids + if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: + cur_keyword_ids = cur_keyword_ids[1:] + if len(cur_keyword_ids) > self.max_keyword_len: + self.max_keyword_len = len(cur_keyword_ids) + self.keyword_ids.append(torch.tensor(cur_keyword_ids)) + self.tokenizer = tokenizer + self.start_len = input_ids.shape[1] + + def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) + self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] + for keyword_id in self.keyword_ids: + truncated_output_ids = output_ids[0, -keyword_id.shape[0]:] + if torch.equal(truncated_output_ids, keyword_id): + return True + outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] + for keyword in self.keywords: + if keyword in outputs: + return True + return False + + def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + outputs = [] + for i in range(output_ids.shape[0]): + outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores)) + return all(outputs) diff --git a/Unicorn_dense/bunny/util/s2wrapper/__init__.py b/Unicorn_dense/bunny/util/s2wrapper/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8632e27fd27fde54860dc07dac543d2c2d0334cb --- /dev/null +++ b/Unicorn_dense/bunny/util/s2wrapper/__init__.py @@ -0,0 +1,2 @@ +from .core import * +from .utils import * \ No newline at end of file diff --git a/Unicorn_dense/bunny/util/s2wrapper/__pycache__/__init__.cpython-310.pyc b/Unicorn_dense/bunny/util/s2wrapper/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14cfdbe1daed9541bd4b0257fa52ae08f12c980c Binary files /dev/null and b/Unicorn_dense/bunny/util/s2wrapper/__pycache__/__init__.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/util/s2wrapper/__pycache__/core.cpython-310.pyc b/Unicorn_dense/bunny/util/s2wrapper/__pycache__/core.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48f0caf8eea0bf971d7627a252cecd00dd301733 Binary files /dev/null and b/Unicorn_dense/bunny/util/s2wrapper/__pycache__/core.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/util/s2wrapper/__pycache__/utils.cpython-310.pyc b/Unicorn_dense/bunny/util/s2wrapper/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7dd51088e7998461a5e108460004a010b307fa1e Binary files /dev/null and b/Unicorn_dense/bunny/util/s2wrapper/__pycache__/utils.cpython-310.pyc differ diff --git a/Unicorn_dense/bunny/util/s2wrapper/core.py b/Unicorn_dense/bunny/util/s2wrapper/core.py new file mode 100644 index 0000000000000000000000000000000000000000..d40710d7c836866f60091804657fe14bf9b1251f --- /dev/null +++ b/Unicorn_dense/bunny/util/s2wrapper/core.py @@ -0,0 +1,62 @@ +# ------------------------------------------------------------------------------------------ +# Copyright (c) 2024 Baifeng Shi. +# All rights reserved. +# +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. +# ------------------------------------------------------------------------------------------ + +import math +import torch +import torch.nn.functional as F +from einops import rearrange +from .utils import split_chessboard, merge_chessboard, batched_forward + +def forward(model, input, scales=None, img_sizes=None, max_split_size=None, resize_output_to_idx=0, num_prefix_token=0, + output_shape='bnc', split_forward=False): + + assert input.dim() == 4, "Input image must be in the shape of BxCxHxW." + assert input.shape[2] == input.shape[3], "Currently only square images are supported." + assert output_shape in ['bnc', 'bchw'], "Output shape should be either BxNxC (e.g., ViT) or BxCxHxW (e.g., ConvNet)." + assert output_shape == 'bnc' or num_prefix_token == 0, "For ConvNet there shouldn't be any prefix token." + + b, c, input_size, _ = input.shape + + # image size for each scale + assert scales is not None or img_sizes is not None, "Please assign either scales or img_sizes." + img_sizes = img_sizes or [int(input_size * scale) for scale in scales] + + # prepare multiscale inputs + max_split_size = max_split_size or input_size # The maximum size of each split of image. Set as the input size by default + num_splits = [math.ceil(size / max_split_size) for size in img_sizes] # number of splits each scale + input_multiscale = [] + for size, num_split in zip(img_sizes, num_splits): + x = F.interpolate(input.to(torch.float32), size=size, mode='bicubic').to(input.dtype) + x = split_chessboard(x, num_split=num_split) + input_multiscale.append(x) + + # run feedforward on each scale + outs_multiscale = [batched_forward(model, x, b) if split_forward else model(x) for x in input_multiscale] + if num_prefix_token > 0: + outs_prefix_multiscale = [out[:, :num_prefix_token] for out in outs_multiscale] + outs_multiscale = [out[:, num_prefix_token:] for out in outs_multiscale] + if output_shape == 'bnc': + outs_multiscale = [rearrange(out, 'b (h w) c -> b c h w', h=int(out.shape[1] ** 0.5), w=int(out.shape[1] ** 0.5)) + for out in outs_multiscale] + + # merge outputs of different splits for each scale separately + outs_multiscale = [merge_chessboard(out, num_split=num_split) for num_split, out in zip(num_splits, outs_multiscale)] + + # interpolate outputs from different scales and concat together + output_size = outs_multiscale[resize_output_to_idx].shape[-2] + out = torch.cat([F.interpolate(outs_multiscale[i].to(torch.float32), size=output_size, + mode='area').to(outs_multiscale[i].dtype) + for i in range(len(outs_multiscale))], dim=1) + if output_shape == 'bnc': + out = rearrange(out, 'b c h w -> b (h w) c') + if num_prefix_token > 0: + # take the mean of prefix tokens from different splits for each scale + outs_prefix_multiscale = [torch.stack(out.split(b, dim=0), dim=0).mean(dim=0) for out in outs_prefix_multiscale] + out_prefix_multiscale = torch.cat(outs_prefix_multiscale, dim=-1) + out = torch.cat([out_prefix_multiscale, out], dim=1) + + return out diff --git a/Unicorn_dense/bunny/util/s2wrapper/utils.py b/Unicorn_dense/bunny/util/s2wrapper/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f69c8bd94b2f07c88aa8753d9485c2dae8fb3f6f --- /dev/null +++ b/Unicorn_dense/bunny/util/s2wrapper/utils.py @@ -0,0 +1,41 @@ +# ------------------------------------------------------------------------------------------ +# Copyright (c) 2024 Baifeng Shi. +# All rights reserved. +# +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. +# ------------------------------------------------------------------------------------------ + +import torch + +def split_chessboard(x, num_split): + """ + x: b * c * h * w + Deividing x into num_split**2 sub-squares, and concatenate all the sub-squares on the batch dimension + """ + B, C, H, W = x.shape + assert H % num_split == 0 and W % num_split == 0 + h, w = H // num_split, W // num_split + x_split = torch.cat([x[:, :, i*h:(i+1)*h, j*w:(j+1)*w] for i in range(num_split) for j in range(num_split)], dim=0) + return x_split + +def merge_chessboard(x, num_split): + """ + x: b * c * h * w + Assuming x contains num_split**2 sub-squares concatenated along batch dimension, merge the sub-squares back to the original whole square. + (inverse of split_chessboard) + """ + B, C, H, W = x.shape + assert B % (num_split**2) == 0 + b = B // (num_split**2) + x_merge = torch.cat([torch.cat([x[(i*num_split + j)*b:(i*num_split + j + 1)*b] for j in range(num_split)], dim=-1) + for i in range(num_split)], dim=-2) + return x_merge + +def batched_forward(model, x, batch_size=-1): + if batch_size == -1: + return model(x) + else: + x_batched = x.split(batch_size) + outs = [model(x) for x in x_batched] + return torch.cat(outs, dim=0) + diff --git a/Unicorn_dense/bunny/util/utils.py b/Unicorn_dense/bunny/util/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b42887852bb27671b3cf023649245a2ad7cd31e1 --- /dev/null +++ b/Unicorn_dense/bunny/util/utils.py @@ -0,0 +1,124 @@ +import logging +import logging.handlers +import os +import sys + +from bunny.constants import LOGDIR + +server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" +moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." + +handler = None + + +def disable_torch_init(): + """ + Disable the redundant torch default initialization to accelerate model creation. + """ + import torch + setattr(torch.nn.Linear, "reset_parameters", lambda self: None) + setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) + + +def build_logger(logger_name, logger_filename): + global handler + + formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + # Set the format of root handlers + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO) + logging.getLogger().handlers[0].setFormatter(formatter) + + # Redirect stdout and stderr to loggers + stdout_logger = logging.getLogger("stdout") + stdout_logger.setLevel(logging.INFO) + sl = StreamToLogger(stdout_logger, logging.INFO) + sys.stdout = sl + + stderr_logger = logging.getLogger("stderr") + stderr_logger.setLevel(logging.ERROR) + sl = StreamToLogger(stderr_logger, logging.ERROR) + sys.stderr = sl + + # Get logger + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + + # Add a file handler for all loggers + if handler is None: + os.makedirs(LOGDIR, exist_ok=True) + filename = os.path.join(LOGDIR, logger_filename) + handler = logging.handlers.TimedRotatingFileHandler( + filename, when='D', utc=True, encoding='UTF-8') + handler.setFormatter(formatter) + + for name, item in logging.root.manager.loggerDict.items(): + if isinstance(item, logging.Logger): + item.addHandler(handler) + + return logger + + +class StreamToLogger(object): + """ + Fake file-like stream object that redirects writes to a logger instance. + """ + + def __init__(self, logger, log_level=logging.INFO): + self.terminal = sys.stdout + self.logger = logger + self.log_level = log_level + self.linebuf = '' + + def __getattr__(self, attr): + return getattr(self.terminal, attr) + + def write(self, buf): + temp_linebuf = self.linebuf + buf + self.linebuf = '' + for line in temp_linebuf.splitlines(True): + # From the io.TextIOWrapper docs: + # On output, if newline is None, any '\n' characters written + # are translated to the system default line separator. + # By default sys.stdout.write() expects '\n' newlines and then + # translates them so this is still cross platform. + if line[-1] == '\n': + self.logger.log(self.log_level, line.rstrip()) + else: + self.linebuf += line + + def flush(self): + if self.linebuf != '': + self.logger.log(self.log_level, self.linebuf.rstrip()) + self.linebuf = '' + + +def violates_moderation(text): + """ + Check whether the text violates OpenAI moderation API. + """ + url = "https://api.openai.com/v1/moderations" + headers = {"Content-Type": "application/json", + "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} + text = text.replace("\n", "") + data = "{" + '"input": ' + f'"{text}"' + "}" + data = data.encode("utf-8") + try: + ret = requests.post(url, headers=headers, data=data, timeout=5) + flagged = ret.json()["results"][0]["flagged"] + except requests.exceptions.RequestException as e: + flagged = False + except KeyError as e: + flagged = False + + return flagged + + +def pretty_print_semaphore(semaphore): + if semaphore is None: + return "None" + return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" diff --git a/Unicorn_dense/data/embed_mean.py b/Unicorn_dense/data/embed_mean.py new file mode 100644 index 0000000000000000000000000000000000000000..95fd7db284eabdddd920d5f837d63bf65299a431 --- /dev/null +++ b/Unicorn_dense/data/embed_mean.py @@ -0,0 +1,124 @@ +import pickle +import torch +import os +import logging +import numpy as np + +def setup_logging(): + + logging.basicConfig( + level=logging.WARNING, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler("compute_text_mean.log"), + logging.StreamHandler() + ] + ) + +def main(): + setup_logging() + + TEXT_EMBED_MEAN = "/data/xmyu/data/embeddings/pkl/text_embed_mean_shift_512_47.pkl" + input_files = [f"/data/xmyu/data/embeddings/pkl/captions_512_47/caption_embeddings_{i}.pkl" for i in range(1, 8)] + output_dir = "/data/xmyu/data/embeddings/pkl/captions_512_47_mean_shift/" + + if not os.path.exists(output_dir): + try: + os.makedirs(output_dir) + print(f"Output directory created: {output_dir}") + except Exception as e: + logging.error(f"Failed to create output directory {output_dir}: {e}") + return + + text_mean = torch.zeros(1, 1280) + + total_captions = 0 + + for file_path in input_files: + if not os.path.exists(file_path): + logging.warning(f"File does not exist: {file_path}") + continue + + print(f"Loading data from {file_path}") + with open(file_path, "rb") as f: + try: + data = pickle.load(f) + except Exception as e: + logging.warning(f"Failed to load file {file_path}: {e}") + continue + + for item in data: + embed = item.get("embed") + if embed is None: + logging.warning(f"Missing 'embed' key in item: {item}") + continue + try: + cap_embed = torch.from_numpy(embed).float() + except Exception as e: + logging.warning(f"Failed to convert 'embed' to tensor: {e}") + continue + text_mean += cap_embed.unsqueeze(0) + total_captions += 1 + + if total_captions == 0: + logging.error("No valid text embeddings found for calculation.") + return + + text_mean = text_mean / total_captions + + try: + with open(TEXT_EMBED_MEAN, "wb") as f: + pickle.dump(text_mean, f) + print(f"Text embedding mean saved to {TEXT_EMBED_MEAN}") + print(f"Processed a total of {total_captions} captions.") + except Exception as e: + logging.error(f"Failed to save mean to file {TEXT_EMBED_MEAN}: {e}") + return + + text_mean_np = text_mean.numpy().squeeze(0) + + for file_path in input_files: + if not os.path.exists(file_path): + logging.warning(f"File does not exist: {file_path}") + continue + + print(f"Processing and saving data from {file_path}") + with open(file_path, "rb") as f: + try: + data = pickle.load(f) + except Exception as e: + logging.warning(f"Failed to load file {file_path} for normalization: {e}") + continue + + modified = False + for item in data: + embed = item.get("embed") + if embed is None: + logging.warning(f"Missing 'embed' key in item: {item}") + continue + try: + embed = np.array(embed) + if embed.shape != text_mean_np.shape: + logging.warning(f"'embed' shape {embed.shape} does not match mean shape {text_mean_np.shape}, skipping item.") + continue + embed_mean_shift = embed - text_mean_np + item['embed'] = embed_mean_shift + modified = True + except Exception as e: + logging.warning(f"Failed to normalize 'embed': {e}") + continue + + if modified: + file_name = os.path.basename(file_path) + new_file_path = os.path.join(output_dir, file_name) + try: + with open(new_file_path, "wb") as f: + pickle.dump(data, f) + print(f"Normalized data saved to {new_file_path}") + except Exception as e: + logging.warning(f"Failed to save normalized data to file {new_file_path}: {e}") + else: + logging.warning(f"No 'embed' data to modify in file {file_path}.") + +if __name__ == "__main__": + main() diff --git a/Unicorn_dense/inference.py b/Unicorn_dense/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..5e7c32e3c9786888764e3b6d66a263ff04908172 --- /dev/null +++ b/Unicorn_dense/inference.py @@ -0,0 +1,59 @@ +import torch +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer +from PIL import Image +import warnings +import transformers +import pickle + +# disable some warnings +transformers.logging.set_verbosity_error() +transformers.logging.disable_progress_bar() +warnings.filterwarnings('ignore') + +# set device +device = 'cuda' # or cpu +torch.set_default_device(device) + +import sys + +# 把 /data/xmyu/Bunny_text/ 加进 sys.path,以便后续 import +sys.path.insert(0, "/data/xmyu/Bunny_text") +from bunny.model.language_model.bunny_llama import BunnyLlamaConfig, BunnyLlamaForCausalLM + + +# create model +model = AutoModelForCausalLM.from_pretrained( + '/data/xmyu/Bunny_text/checkpoints-llama3-8b/bunny-llama3-8b/checkpoint-7364', + torch_dtype=torch.float16, # float32 for cpu + trust_remote_code=True).to("cuda") + +tokenizer = AutoTokenizer.from_pretrained( + '/data/xmyu/Bunny_text/checkpoints-llama3-8b/bunny-llama3-8b/checkpoint-7364', + trust_remote_code=True) + +# Is this anime style? + +# text prompt +prompt = "" +text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \n{prompt} ASSISTANT:" +text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('')] +input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0).to(device) + + +pkl_file = "image_features.pkl" +with open(pkl_file, "rb") as f: + loaded_features = pickle.load(f) + +image_features = torch.from_numpy(loaded_features).to("cuda") + +# generate +output_ids = model.generate( + input_ids, + embeds=image_features, + max_new_tokens=1024, + use_cache=False, + repetition_penalty=1.0 # increase this to avoid chattering +)[0] + +print(tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()) diff --git a/Unicorn_dense/script/.DS_Store b/Unicorn_dense/script/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..3661d5c0b5865e9f10686339af2867fbb38c1a1f Binary files /dev/null and b/Unicorn_dense/script/.DS_Store differ diff --git a/Unicorn_dense/script/batch_inference.py b/Unicorn_dense/script/batch_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..441dcd7f99a836c046b12f46130a716b8eb6c4f0 --- /dev/null +++ b/Unicorn_dense/script/batch_inference.py @@ -0,0 +1,69 @@ +import torch +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer +from PIL import Image +import warnings + +# disable some warnings +transformers.logging.set_verbosity_error() +transformers.logging.disable_progress_bar() +warnings.filterwarnings('ignore') + +# set device +device = 'cuda' # or cpu +torch.set_default_device(device) + +model_name = 'BAAI/Bunny-v1_1-Llama-3-8B-V' # or 'BAAI/Bunny-Llama-3-8B-V' or 'BAAI/Bunny-v1_1-4B' or 'BAAI/Bunny-v1_0-4B' or 'BAAI/Bunny-v1_0-3B' or 'BAAI/Bunny-v1_0-3B-zh' or 'BAAI/Bunny-v1_0-2B-zh' + +# create model +model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.float16, # float32 for cpu + device_map='auto', + trust_remote_code=True) +tokenizer = AutoTokenizer.from_pretrained( + model_name, + trust_remote_code=True) + +# for batch inference +tokenizer.padding_side = "left" +tokenizer.pad_token_id = model.generation_config.pad_token_id +padding_max_length = 128 # customize for your circumstance +tokenizer.add_tokens(['']) +image_token_id = tokenizer.convert_tokens_to_ids('') + +# text prompts +prompts = [ + 'What is the astronaut holding in his hand?', + 'Why is the image funny?', + 'What is the occupation of the person in the picture?', + 'What animal is in the picture?' +] +texts = [ + f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \n{prompt} ASSISTANT:" + for prompt in prompts] +input_ids = torch.tensor( + [tokenizer(text, padding='max_length', max_length=padding_max_length).input_ids for text in texts], + dtype=torch.long).to(device) +input_ids[input_ids == image_token_id] = -200 + +# images, sample images can be found in https://huggingface.co/BAAI/Bunny-v1_1-Llama-3-8B-V/tree/main/images +image_paths = [ + 'example_1.png', + 'example_2.png', + 'example_1.png', + 'example_2.png' +] +images = [Image.open(image_path) for image_path in image_paths] +image_tensor = model.process_images(images, model.config).to(dtype=model.dtype, device=device) + +# generate +output_ids = model.generate( + input_ids, + images=image_tensor, + max_new_tokens=100, + use_cache=True, + repetition_penalty=1.0 # increase this to avoid chattering +) + +print([ans.strip() for ans in tokenizer.batch_decode(output_ids[:, input_ids.shape[1]:], skip_special_tokens=True)]) diff --git a/Unicorn_dense/script/conversion_to_GGUF.md b/Unicorn_dense/script/conversion_to_GGUF.md new file mode 100644 index 0000000000000000000000000000000000000000..1672387fc0ba8621c6c8e938a550033cefc02ca6 --- /dev/null +++ b/Unicorn_dense/script/conversion_to_GGUF.md @@ -0,0 +1,286 @@ +# Conversion to GGUF + +* Firstly, prepare a python environment and install the following dependencies: + + ```shell + pip install torch transformers gguf sentencepiece + ``` + +* And then install `llama.cpp`. + +* Then, edit `llama.cpp/examples/llava/convert-image-encoder-to-gguf.py` to support SigLIP: + + * when importing packages, chage + + ```python + from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel + ``` + + to + + ```python + from transformers import SiglipModel as CLIPModel + from transformers import SiglipProcessor as CLIPProcessor + from transformers import SiglipVisionModel as CLIPVisionModel + ``` + + +* Then, edit `llama.cpp/convert-hf-to-gguf.py` to skip unknown parts: + + change + + ```python + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) + if new_name is None: + raise ValueError(f"Can not map tensor {name!r}") + return new_name + ``` + + to + + ```python + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) + return new_name + ``` + + change + + ```python + def write_tensors(self): + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + + for name, data_torch in self.get_tensors(): + ... + + for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): + data: np.ndarray = data # type hint + n_dims = len(data.shape) + ... + ``` + + to + + ```python + def write_tensors(self): + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + + for name, data_torch in self.get_tensors(): + ... + + for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): + if new_name is None: + continue + + data: np.ndarray = data # type hint + n_dims = len(data.shape) + ... + ``` + + + +## converting [Bunny-Llama-3-8B-V](https://huggingface.co/BAAI/Bunny-Llama-3-8B-V) + +1. `cd llama.cpp/examples/llava` + +2. Download the weights and put under `./` + +3. Extract the weights of vision tower and multimodel projector: + + ```shell + python llava-surgery-v2.py -C -m Bunny-Llama-3-8B-V + ``` + + you will find a `llava.projector` and a `llava.clip` file in `Bunny-Llama-3-8B-V` + +4. Create the visual gguf model: + + * prepare files + + ```shell + cd Bunny-Llama-3-8B-V + mkdir vit + cp llava.clip vit/pytorch_model.bin + cp llava.projector vit/ + ``` + + and put [`config.json`](#appendix) under `vit/` + + * and then: + + ```shell + python ../convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 + cd .. + ``` + + you will find a `mmproj-model-f16.gguf` file in `Bunny-Llama-3-8B-V/vit` + + + +5. Convert the left language part: + + * edit `Bunny-Llama-3-8B-V/config.json`: + + change + + ```json + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + ``` + + to + + ```json + "architectures": [ + "LlamaForCausalLM" + ], + ``` + + * And then: + + ```shell + python ../../convert-hf-to-gguf.py Bunny-Llama-3-8B-V + ``` + + you will find a `ggml-model-f16.gguf` file in `Bunny-Llama-3-8B-V` + + + + + +## converting [Bunny-v1_0-4B](https://huggingface.co/BAAI/Bunny-v1_0-4B) + +1. `cd llama.cpp/examples/llava` + +2. Download the weights and put under `./` + +3. Extract the weights of vision tower and multimodel projector: + + ```shell + python llava-surgery-v2.py -C -m Bunny-v1_0-4B + ``` + + you will find a `llava.projector` and a `llava.clip` file in `Bunny-v1_0-4B` + +4. Create the visual gguf model: + + * prepare files + + ```shell + cd Bunny-v1_0-4B + mkdir vit + cp llava.clip vit/pytorch_model.bin + cp llava.projector vit/ + ``` + + and put [`config.json`](#appendix) under `vit/` + + * and then: + + ```shell + python ../convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 + cd .. + ``` + + you will find a `mmproj-model-f16.gguf` file in `Bunny-v1_0-4B/vit` + + + +5. Convert the left language part: + + * edit `Bunny-v1_0-4B/config.json`: + + change + + ```json + "architectures": [ + "BunnyPhi3ForCausalLM" + ], + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_phi3.BunnyPhi3Config", + "AutoModelForCausalLM": "modeling_bunny_phi3.BunnyPhi3ForCausalLM" + }, + ``` + + to + + ```json + "architectures": [ + "Phi3ForCausalLM" + ], + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + ``` + + * And then: + + ```shell + python ../../convert-hf-to-gguf.py Bunny-v1_0-4B + ``` + + you will find a `ggml-model-f16.gguf` file in `Bunny-v1_0-4B` + + + + +## Appendix + +`vit/config.json` + +```json +{ + "architectures": [ + "SiglipVisionModel" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "image_size": 384, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 4304, + "layer_norm_eps": 1e-6, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": 1e-05, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "num_key_value_heads": 8, + "patch_size": 14, + "pretraining_tp": 1, + "projection_dim": 1152, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.40.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "vocab_size": 128256 +} +``` + + + diff --git a/Unicorn_dense/script/deepspeed/zero2.json b/Unicorn_dense/script/deepspeed/zero2.json new file mode 100644 index 0000000000000000000000000000000000000000..c95ebefe07b7d8d9fd0936a014679d07102cc270 --- /dev/null +++ b/Unicorn_dense/script/deepspeed/zero2.json @@ -0,0 +1,23 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "zero_optimization": { + "stage": 2, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto" + } +} \ No newline at end of file diff --git a/Unicorn_dense/script/deepspeed/zero3.json b/Unicorn_dense/script/deepspeed/zero3.json new file mode 100644 index 0000000000000000000000000000000000000000..6917317af62da757ca759a92b326ddfa65b203cc --- /dev/null +++ b/Unicorn_dense/script/deepspeed/zero3.json @@ -0,0 +1,28 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + } +} \ No newline at end of file diff --git a/Unicorn_dense/script/merge_lora_weights.py b/Unicorn_dense/script/merge_lora_weights.py new file mode 100644 index 0000000000000000000000000000000000000000..a87959dc3f267b924347b969c834ed5248bc681b --- /dev/null +++ b/Unicorn_dense/script/merge_lora_weights.py @@ -0,0 +1,26 @@ +import argparse +import os +from bunny.model.builder import load_pretrained_model +from bunny.util.mm_utils import get_model_name_from_path + + +def merge_lora(args): + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, + args.model_type) + + model.save_pretrained(args.save_model_path) + tokenizer.save_pretrained(args.save_model_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, required=True) + parser.add_argument("--model-base", type=str, required=True) + parser.add_argument("--model-type", type=str, required=True) + parser.add_argument("--save-model-path", type=str, required=True) + + args = parser.parse_args() + + merge_lora(args) diff --git a/Unicorn_dense/script/train/.DS_Store b/Unicorn_dense/script/train/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..94d9071d93e740f73c3720c2fabd6bc1caab0e86 Binary files /dev/null and b/Unicorn_dense/script/train/.DS_Store differ diff --git a/Unicorn_dense/script/train/finetune_full.sh b/Unicorn_dense/script/train/finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..25cfb1dee5dde7a9bd0b2685a78b52a4e118e3c3 --- /dev/null +++ b/Unicorn_dense/script/train/finetune_full.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +MODEL_TYPE=llama3-8b + +PRETRAIN_DIR=bunny-$MODEL_TYPE-pretrain +OUTPUT_DIR=bunny-$MODEL_TYPE + +mkdir -p ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR + +deepspeed bunny/train/train.py \ + --deepspeed ./script/deepspeed/zero3.json \ + --model_name_or_path /data/xmyu/checkpoints/Meta-Llama-3-8B-Instruct \ + --model_type $MODEL_TYPE \ + --version llama \ + --data_path /data/xmyu/data/Synthetic_Data/Instruction.json \ + --pretrain_mm_mlp_adapter /data/xmyu/Bunny_text/checkpoints-pretrain/$PRETRAIN_DIR/checkpoint-1450/mm_projector.bin \ + --mm_projector_type mlp2x_gelu \ + --image_aspect_ratio pad \ + --group_by_modality_length False \ + --bf16 True \ + --output_dir ./checkpoints-$MODEL_TYPE/$OUTPUT_DIR \ + --num_train_epochs 1 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 2 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 2200 \ + --save_total_limit 1 \ + --learning_rate 1e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb \ No newline at end of file diff --git a/Unicorn_dense/script/train/pretrain-bunny.sh b/Unicorn_dense/script/train/pretrain-bunny.sh new file mode 100644 index 0000000000000000000000000000000000000000..5453e3cc4f96c224742e446dc2f2083790238375 --- /dev/null +++ b/Unicorn_dense/script/train/pretrain-bunny.sh @@ -0,0 +1,44 @@ +#!/bin/bash +export PYTHONPATH=$PYTHONPATH:$(pwd) + +export NCCL_SOCKET_IFNAME=eth0 +export GLOO_SOCKET_IFNAME=eth0 +export TP_SOCKET_IFNAME=eth0 + +PYTHON_CMD="/llm-align/liuchonghan/env/envs/bunny/bin/python" +MODEL_TYPE=llama3-8b +Data_TYPE=bunny +OUTPUT_DIR=bunny-$MODEL_TYPE-pretrain + +mkdir -p /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-$Data_TYPE/$OUTPUT_DIR + +$PYTHON_CMD -m deepspeed.launcher.runner bunny/train/train.py \ + --deepspeed ./script/deepspeed/zero2.json \ + --model_name_or_path /llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct \ + --model_type $MODEL_TYPE \ + --version plain \ + --data_path /llm-align/liuchonghan/xiaomin/data/bunny/pretrain/bunny_pretrain_ours.json \ + --mm_projector_type mlp2x_gelu \ + --tune_mm_mlp_adapter True \ + --image_aspect_ratio square \ + --bf16 True \ + --output_dir /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-$Data_TYPE/$OUTPUT_DIR \ + --num_train_epochs 1 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100000 \ + --save_total_limit 1 \ + --learning_rate 5e-4 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/Unicorn_dense/script/train/pretrain-densefusion.sh b/Unicorn_dense/script/train/pretrain-densefusion.sh new file mode 100644 index 0000000000000000000000000000000000000000..cbeb6c086287dd6d7ab9770eb3ab67c3b88b40d5 --- /dev/null +++ b/Unicorn_dense/script/train/pretrain-densefusion.sh @@ -0,0 +1,44 @@ +#!/bin/bash +export PYTHONPATH=$PYTHONPATH:$(pwd) + +export NCCL_SOCKET_IFNAME=eth0 +export GLOO_SOCKET_IFNAME=eth0 +export TP_SOCKET_IFNAME=eth0 + +PYTHON_CMD="/llm-align/liuchonghan/env/envs/bunny/bin/python" +MODEL_TYPE=llama3-8b +Data_TYPE=densefusion +OUTPUT_DIR=densefusion-$MODEL_TYPE-pretrain + +mkdir -p /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-$Data_TYPE/$OUTPUT_DIR + +$PYTHON_CMD -m deepspeed.launcher.runner bunny/train/train.py \ + --deepspeed ./script/deepspeed/zero2.json \ + --model_name_or_path /llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct \ + --model_type $MODEL_TYPE \ + --version plain \ + --data_path /llm-align/liuchonghan/xiaomin/data/densefusion/densefusion_pretrain_ours.json \ + --mm_projector_type mlp2x_gelu \ + --tune_mm_mlp_adapter True \ + --image_aspect_ratio square \ + --bf16 True \ + --output_dir /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-$Data_TYPE/$OUTPUT_DIR \ + --num_train_epochs 1 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100000 \ + --save_total_limit 1 \ + --learning_rate 5e-4 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/Unicorn_dense/wandb/debug-cli.root.log b/Unicorn_dense/wandb/debug-cli.root.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Unicorn_dense/wandb/debug-internal.log b/Unicorn_dense/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..a9cdab117235c9cd8e5bc6322bae8919a0d4a377 --- /dev/null +++ b/Unicorn_dense/wandb/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2026-01-15T23:09:09.359519736+08:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2026-01-15T23:09:10.053029159+08:00","level":"INFO","msg":"stream: created new stream","id":"tzltcduc"} +{"time":"2026-01-15T23:09:10.053157747+08:00","level":"INFO","msg":"handler: started","stream_id":"tzltcduc"} +{"time":"2026-01-15T23:09:10.053614895+08:00","level":"INFO","msg":"stream: started","id":"tzltcduc"} +{"time":"2026-01-15T23:09:10.053705975+08:00","level":"INFO","msg":"writer: started","stream_id":"tzltcduc"} +{"time":"2026-01-15T23:09:10.053751795+08:00","level":"INFO","msg":"sender: started","stream_id":"tzltcduc"} +{"time":"2026-01-16T00:58:27.786570434+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/tzltcduc/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-16T01:01:02.736252069+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/tzltcduc/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-16T01:50:41.805241062+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/tzltcduc/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-16T02:31:11.489117871+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/tzltcduc/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-16T05:47:50.424144962+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/tzltcduc/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-16T08:17:31.77758427+08:00","level":"INFO","msg":"stream: closing","id":"tzltcduc"} +{"time":"2026-01-16T08:17:34.163717521+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-01-16T08:17:34.848919185+08:00","level":"INFO","msg":"handler: closed","stream_id":"tzltcduc"} +{"time":"2026-01-16T08:17:34.849133266+08:00","level":"INFO","msg":"sender: closed","stream_id":"tzltcduc"} +{"time":"2026-01-16T08:17:34.84916337+08:00","level":"INFO","msg":"stream: closed","id":"tzltcduc"} diff --git a/Unicorn_dense/wandb/debug.log b/Unicorn_dense/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..b08e57f53a0cd2ba6a32dccb3c226ce02bfb41e3 --- /dev/null +++ b/Unicorn_dense/wandb/debug.log @@ -0,0 +1,26 @@ +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_setup.py:_flush():80] Configure stats pid to 443684 +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_setup.py:_flush():80] Loading settings from /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/settings +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/logs/debug.log +2026-01-15 23:09:08,905 INFO MainThread:443684 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/logs/debug-internal.log +2026-01-15 23:09:08,905 INFO MainThread:443684 [wandb_init.py:init():841] calling init triggers +2026-01-15 23:09:08,905 INFO MainThread:443684 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-01-15 23:09:08,905 INFO MainThread:443684 [wandb_init.py:init():889] starting backend +2026-01-15 23:09:09,343 INFO MainThread:443684 [wandb_init.py:init():892] sending inform_init request +2026-01-15 23:09:09,356 INFO MainThread:443684 [wandb_init.py:init():900] backend started and connected +2026-01-15 23:09:09,358 INFO MainThread:443684 [wandb_init.py:init():970] updated telemetry +2026-01-15 23:09:09,359 INFO MainThread:443684 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2026-01-15 23:09:10,667 INFO MainThread:443684 [wandb_init.py:init():1041] starting run threads in backend +2026-01-15 23:09:10,942 INFO MainThread:443684 [wandb_run.py:_console_start():2521] atexit reg +2026-01-15 23:09:10,942 INFO MainThread:443684 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2026-01-15 23:09:10,942 INFO MainThread:443684 [wandb_run.py:_redirect():2438] Wrapping output streams. +2026-01-15 23:09:10,943 INFO MainThread:443684 [wandb_run.py:_redirect():2461] Redirects installed. +2026-01-15 23:09:10,950 INFO MainThread:443684 [wandb_init.py:init():1081] run started, returning control to user process +2026-01-15 23:09:10,953 INFO MainThread:443684 [wandb_run.py:_config_callback():1396] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 8192, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 500000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct', 'transformers_version': '4.44.0', 'model_type': 'bunny-llama', 'use_mm_proj': True, 'mm_projector_type': 'mlp2x_gelu', 'mm_hidden_size': 1280, 'image_aspect_ratio': 'square', 'tokenizer_padding_side': 'right', 'tokenizer_model_max_length': 2048, 'tune_mm_mlp_adapter': True, 'freeze_mm_mlp_adapter': False, 'mm_projector_lr': None, 'use_s2': False, 'output_dir': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0005, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain/runs/Jan15_23-06-00_h-liuchonghan-rler1225-a800-a100-2nodes-m-0', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100000, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': True, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': './script/deepspeed/zero2.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'no', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False, 'cache_dir': None, 'mpt_attn_impl': 'triton', 'model_max_length': 2048, 'group_by_modality_length': False} +2026-01-15 23:09:10,972 INFO MainThread:443684 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8052289536 - > +2026-01-15 23:09:10,972 INFO MainThread:443684 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 8052289536 None +2026-01-16 08:17:31,777 INFO wandb-AsyncioManager-main:443684 [service_client.py:_forward_responses():80] Reached EOF. +2026-01-16 08:17:31,777 INFO wandb-AsyncioManager-main:443684 [mailbox.py:close():137] Closing mailbox, abandoning 2 handles. diff --git a/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/files/wandb-summary.json b/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1d476fc88692f959c7a899096787abbc21a55dbc --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":0,"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/logs/debug-core.log b/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..ca6591bad50e9b17916f1b453e59b99efd836e31 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2026-01-13T22:16:17.962684788+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp3yr1jww1/port-1815633.txt","pid":1815633,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-01-13T22:16:17.963863817+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1815633} +{"time":"2026-01-13T22:16:17.963808707+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1815633-1827091-1557386465/socket","Net":"unix"}} +{"time":"2026-01-13T22:16:18.125890514+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-01-13T22:16:18.141567552+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"z92v42uv","id":"1(@)"} +{"time":"2026-01-13T22:16:19.579149529+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"z92v42uv","id":"1(@)"} +{"time":"2026-01-13T22:16:19.892230267+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2026-01-13T22:16:19.892393013+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2026-01-13T22:16:19.892441766+08:00","level":"INFO","msg":"server is shutting down"} +{"time":"2026-01-13T22:16:19.892479344+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2026-01-13T22:16:19.892677432+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-1815633-1827091-1557386465/socket","Net":"unix"}} +{"time":"2026-01-13T22:16:20.065727075+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2026-01-13T22:16:20.065776807+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2026-01-13T22:16:20.065797761+08:00","level":"INFO","msg":"server is closed"} diff --git a/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/logs/debug-internal.log b/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..b4d5ec35bc212a9ee747a4d5eb2a3825c6e14291 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/logs/debug-internal.log @@ -0,0 +1,15 @@ +{"time":"2026-01-13T22:16:18.142175576+08:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2026-01-13T22:16:19.578338876+08:00","level":"INFO","msg":"stream: created new stream","id":"z92v42uv"} +{"time":"2026-01-13T22:16:19.578617708+08:00","level":"INFO","msg":"handler: started","stream_id":"z92v42uv"} +{"time":"2026-01-13T22:16:19.579115722+08:00","level":"INFO","msg":"stream: started","id":"z92v42uv"} +{"time":"2026-01-13T22:16:19.579181868+08:00","level":"INFO","msg":"writer: started","stream_id":"z92v42uv"} +{"time":"2026-01-13T22:16:19.579221472+08:00","level":"INFO","msg":"sender: started","stream_id":"z92v42uv"} +{"time":"2026-01-13T22:16:19.873240213+08:00","level":"ERROR","msg":"HTTP error","status":401,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2026-01-13T22:16:19.87339772+08:00","level":"ERROR","msg":"runupserter: failed to init run","error":"returned error 401: {\"data\":{\"upsertBucket\":null},\"errors\":[{\"message\":\"user is not logged in\",\"path\":[\"upsertBucket\"],\"extensions\":{\"code\":\"PERMISSION_ERROR\"}}]}"} +{"time":"2026-01-13T22:16:19.892353549+08:00","level":"INFO","msg":"stream: closing","id":"z92v42uv"} +{"time":"2026-01-13T22:16:20.011837057+08:00","level":"ERROR","msg":"sender: uploadConfigFile: runupserter: run not yet initialized"} +{"time":"2026-01-13T22:16:20.062252431+08:00","level":"ERROR","msg":"runfiles: runupserter: run not yet initialized"} +{"time":"2026-01-13T22:16:20.062969767+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-01-13T22:16:20.063020337+08:00","level":"INFO","msg":"handler: closed","stream_id":"z92v42uv"} +{"time":"2026-01-13T22:16:20.063117216+08:00","level":"INFO","msg":"sender: closed","stream_id":"z92v42uv"} +{"time":"2026-01-13T22:16:20.06314285+08:00","level":"INFO","msg":"stream: closed","id":"z92v42uv"} diff --git a/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/logs/debug.log b/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..6eec9e4766d9fd98834a22deb0610feac3a09924 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/logs/debug.log @@ -0,0 +1,15 @@ +2026-01-13 22:16:16,920 INFO MainThread:1815633 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2026-01-13 22:16:16,920 INFO MainThread:1815633 [wandb_setup.py:_flush():80] Configure stats pid to 1815633 +2026-01-13 22:16:16,920 INFO MainThread:1815633 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2026-01-13 22:16:16,921 INFO MainThread:1815633 [wandb_setup.py:_flush():80] Loading settings from /llm-align/liuchonghan/xiaomin/Unicorn/wandb/settings +2026-01-13 22:16:16,921 INFO MainThread:1815633 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-01-13 22:16:16,921 INFO MainThread:1815633 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /llm-align/liuchonghan/xiaomin/Unicorn/wandb/run-20260113_221616-z92v42uv/logs/debug.log +2026-01-13 22:16:16,921 INFO MainThread:1815633 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /llm-align/liuchonghan/xiaomin/Unicorn/wandb/run-20260113_221616-z92v42uv/logs/debug-internal.log +2026-01-13 22:16:16,921 INFO MainThread:1815633 [wandb_init.py:init():841] calling init triggers +2026-01-13 22:16:16,921 INFO MainThread:1815633 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-01-13 22:16:16,921 INFO MainThread:1815633 [wandb_init.py:init():889] starting backend +2026-01-13 22:16:18,126 INFO MainThread:1815633 [wandb_init.py:init():892] sending inform_init request +2026-01-13 22:16:18,138 INFO MainThread:1815633 [wandb_init.py:init():900] backend started and connected +2026-01-13 22:16:18,140 INFO MainThread:1815633 [wandb_init.py:init():970] updated telemetry +2026-01-13 22:16:18,141 INFO MainThread:1815633 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout diff --git a/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/run-z92v42uv.wandb b/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/run-z92v42uv.wandb new file mode 100644 index 0000000000000000000000000000000000000000..de63183409074d3f5ca888a07e5caab6eb1637a2 Binary files /dev/null and b/Unicorn_dense/wandb/run-20260113_221616-z92v42uv/run-z92v42uv.wandb differ diff --git a/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/files/wandb-summary.json b/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a620d0c1047a4dd8a400939b6da246ed8063a7 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0},"_runtime":0} \ No newline at end of file diff --git a/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/logs/debug-core.log b/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..2d493412ace1b2f03a64bebbb31387e45ebc80a7 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2026-01-13T22:19:32.700257007+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpx2yylatn/port-1827759.txt","pid":1827759,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-01-13T22:19:32.701329938+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1827759} +{"time":"2026-01-13T22:19:32.70128789+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1827759-1839217-2342298697/socket","Net":"unix"}} +{"time":"2026-01-13T22:19:32.866259715+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-01-13T22:19:32.879698204+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"2mrjbwog","id":"1(@)"} +{"time":"2026-01-13T22:19:33.578570373+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"2mrjbwog","id":"1(@)"} +{"time":"2026-01-13T22:19:33.882909043+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2026-01-13T22:19:33.883055863+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2026-01-13T22:19:33.883117888+08:00","level":"INFO","msg":"server is shutting down"} +{"time":"2026-01-13T22:19:33.883133045+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2026-01-13T22:19:33.883336791+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-1827759-1839217-2342298697/socket","Net":"unix"}} +{"time":"2026-01-13T22:19:33.93862069+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2026-01-13T22:19:33.938671818+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2026-01-13T22:19:33.938690886+08:00","level":"INFO","msg":"server is closed"} diff --git a/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/logs/debug-internal.log b/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..4ef18595dee75c15a00ed6e12388a5f1e7f8a285 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/logs/debug-internal.log @@ -0,0 +1,15 @@ +{"time":"2026-01-13T22:19:32.880267672+08:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2026-01-13T22:19:33.577878113+08:00","level":"INFO","msg":"stream: created new stream","id":"2mrjbwog"} +{"time":"2026-01-13T22:19:33.578075013+08:00","level":"INFO","msg":"handler: started","stream_id":"2mrjbwog"} +{"time":"2026-01-13T22:19:33.57853978+08:00","level":"INFO","msg":"stream: started","id":"2mrjbwog"} +{"time":"2026-01-13T22:19:33.578623248+08:00","level":"INFO","msg":"writer: started","stream_id":"2mrjbwog"} +{"time":"2026-01-13T22:19:33.578652305+08:00","level":"INFO","msg":"sender: started","stream_id":"2mrjbwog"} +{"time":"2026-01-13T22:19:33.864921024+08:00","level":"ERROR","msg":"HTTP error","status":401,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2026-01-13T22:19:33.865117646+08:00","level":"ERROR","msg":"runupserter: failed to init run","error":"returned error 401: {\"data\":{\"upsertBucket\":null},\"errors\":[{\"message\":\"user is not logged in\",\"path\":[\"upsertBucket\"],\"extensions\":{\"code\":\"PERMISSION_ERROR\"}}]}"} +{"time":"2026-01-13T22:19:33.883060263+08:00","level":"INFO","msg":"stream: closing","id":"2mrjbwog"} +{"time":"2026-01-13T22:19:33.884066839+08:00","level":"ERROR","msg":"sender: uploadConfigFile: runupserter: run not yet initialized"} +{"time":"2026-01-13T22:19:33.9353046+08:00","level":"ERROR","msg":"runfiles: runupserter: run not yet initialized"} +{"time":"2026-01-13T22:19:33.936120142+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-01-13T22:19:33.936163168+08:00","level":"INFO","msg":"handler: closed","stream_id":"2mrjbwog"} +{"time":"2026-01-13T22:19:33.936266752+08:00","level":"INFO","msg":"sender: closed","stream_id":"2mrjbwog"} +{"time":"2026-01-13T22:19:33.936282817+08:00","level":"INFO","msg":"stream: closed","id":"2mrjbwog"} diff --git a/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/logs/debug.log b/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..22922cc80ba7e1e8533162194112fefbfe3bff51 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/logs/debug.log @@ -0,0 +1,15 @@ +2026-01-13 22:19:32,573 INFO MainThread:1827759 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2026-01-13 22:19:32,574 INFO MainThread:1827759 [wandb_setup.py:_flush():80] Configure stats pid to 1827759 +2026-01-13 22:19:32,574 INFO MainThread:1827759 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2026-01-13 22:19:32,574 INFO MainThread:1827759 [wandb_setup.py:_flush():80] Loading settings from /llm-align/liuchonghan/xiaomin/Unicorn/wandb/settings +2026-01-13 22:19:32,574 INFO MainThread:1827759 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-01-13 22:19:32,574 INFO MainThread:1827759 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /llm-align/liuchonghan/xiaomin/Unicorn/wandb/run-20260113_221932-2mrjbwog/logs/debug.log +2026-01-13 22:19:32,574 INFO MainThread:1827759 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /llm-align/liuchonghan/xiaomin/Unicorn/wandb/run-20260113_221932-2mrjbwog/logs/debug-internal.log +2026-01-13 22:19:32,574 INFO MainThread:1827759 [wandb_init.py:init():841] calling init triggers +2026-01-13 22:19:32,574 INFO MainThread:1827759 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-01-13 22:19:32,574 INFO MainThread:1827759 [wandb_init.py:init():889] starting backend +2026-01-13 22:19:32,866 INFO MainThread:1827759 [wandb_init.py:init():892] sending inform_init request +2026-01-13 22:19:32,876 INFO MainThread:1827759 [wandb_init.py:init():900] backend started and connected +2026-01-13 22:19:32,878 INFO MainThread:1827759 [wandb_init.py:init():970] updated telemetry +2026-01-13 22:19:32,879 INFO MainThread:1827759 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout diff --git a/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/run-2mrjbwog.wandb b/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/run-2mrjbwog.wandb new file mode 100644 index 0000000000000000000000000000000000000000..24df7cc7ccb44261231eb061edf6b02948cdb892 Binary files /dev/null and b/Unicorn_dense/wandb/run-20260113_221932-2mrjbwog/run-2mrjbwog.wandb differ diff --git a/Unicorn_dense/wandb/run-20260113_222407-vne3r4hs/files/output.log b/Unicorn_dense/wandb/run-20260113_222407-vne3r4hs/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..b0a6594e02de84e3d354af32327b16230aa6a3cb --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_222407-vne3r4hs/files/output.log @@ -0,0 +1,24 @@ + 0%| | 0/3906 [00:00', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'no', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False, 'cache_dir': None, 'mpt_attn_impl': 'triton', 'model_max_length': 2048, 'group_by_modality_length': False} +2026-01-13 22:24:09,280 INFO MainThread:1839955 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8052289536 - > +2026-01-13 22:24:09,280 INFO MainThread:1839955 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 8052289536 None diff --git a/Unicorn_dense/wandb/run-20260113_222407-vne3r4hs/run-vne3r4hs.wandb b/Unicorn_dense/wandb/run-20260113_222407-vne3r4hs/run-vne3r4hs.wandb new file mode 100644 index 0000000000000000000000000000000000000000..d5ff2692fc567f2c8c59b4f9f56971597fdb3afc Binary files /dev/null and b/Unicorn_dense/wandb/run-20260113_222407-vne3r4hs/run-vne3r4hs.wandb differ diff --git a/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/files/config.yaml b/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2435de14d62950aecc6f41ec2bbb463e403b8822 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/files/config.yaml @@ -0,0 +1,617 @@ +_name_or_path: + value: /llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct +_wandb: + value: + cli_version: 0.23.1 + e: + mwnl7hrs0a392l5mmm095780to1kdu6d: + args: + - --local_rank=0 + - --deepspeed + - ./script/deepspeed/zero2.json + - --model_name_or_path + - /llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct + - --model_type + - llama3-8b + - --version + - plain + - --data_path + - /llm-align/liuchonghan/xiaomin/data/densefusion/densefusion_pretrain_ours.json + - --mm_projector_type + - mlp2x_gelu + - --tune_mm_mlp_adapter + - "True" + - --image_aspect_ratio + - square + - --bf16 + - "True" + - --output_dir + - /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain + - --num_train_epochs + - "1" + - --per_device_train_batch_size + - "8" + - --per_device_eval_batch_size + - "4" + - --gradient_accumulation_steps + - "4" + - --evaluation_strategy + - "no" + - --save_strategy + - steps + - --save_steps + - "100000" + - --save_total_limit + - "1" + - --learning_rate + - "5e-4" + - --weight_decay + - "0." + - --warmup_ratio + - "0.03" + - --lr_scheduler_type + - cosine + - --logging_steps + - "1" + - --tf32 + - "True" + - --model_max_length + - "2048" + - --gradient_checkpointing + - "True" + - --dataloader_num_workers + - "4" + - --lazy_preprocess + - "True" + - --report_to + - wandb + codePath: bunny/train/train.py + codePathLocal: bunny/train/train.py + cpu_count: 128 + cpu_count_logical: 255 + cudaVersion: "12.4" + disk: + /: + total: "53687091200" + used: "15148806144" + email: yuxm02@gmail.com + executable: /llm-align/liuchonghan/env/envs/bunny/bin/python + gpu: NVIDIA A800-SXM4-80GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-a3fc74bf-1f17-5efa-f250-bcbcbcf95ece + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-de894aa2-80ed-5287-811c-51192fa8348e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-3c6bdd46-409d-a93f-e9b7-a124938ce17e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-6e5234d7-e2d4-69d9-3213-cc1239573e6d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-0e81592d-2c75-2476-c37d-e8d1d4fb2d65 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-d2f05526-805c-8dec-ae5f-ff04f3cec8f5 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-b3f69678-17b2-d1eb-c163-f104c5f03d77 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-9bf33294-cd7e-ffd9-184d-02e4b5d18550 + host: h-liuchonghan-rler1225-a800-a100-2nodes-m-0 + memory: + total: "2164358094848" + os: Linux-5.4.0-42-generic-x86_64-with-glibc2.35 + program: /llm-align/liuchonghan/xiaomin/Unicorn_dense/bunny/train/train.py + python: CPython 3.10.19 + root: /llm-align/liuchonghan/xiaomin/Unicorn_dense + startedAt: "2026-01-13T14:49:42.351538Z" + writerId: mwnl7hrs0a392l5mmm095780to1kdu6d + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.10.19 + t: + "1": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 63 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 63 + - 71 + - 98 + "3": + - 7 + - 13 + - 19 + - 62 + - 66 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.44.0 + "9": + "1": transformers_trainer + "12": 0.23.1 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +add_cross_attention: + value: false +architectures: + value: + - LlamaForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +bad_words_ids: + value: null +batch_eval_metrics: + value: false +begin_suppress_tokens: + value: null +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: 128000 +cache_dir: + value: null +chunk_size_feed_forward: + value: 0 +cross_attention_hidden_size: + value: null +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 4 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +decoder_start_token_id: + value: null +deepspeed: + value: ./script/deepspeed/zero2.json +disable_tqdm: + value: false +dispatch_batches: + value: null +diversity_penalty: + value: 0 +do_eval: + value: false +do_predict: + value: false +do_sample: + value: false +do_train: + value: false +early_stopping: + value: false +encoder_no_repeat_ngram_size: + value: 0 +eos_token_id: + value: 128001 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +evaluation_strategy: + value: "no" +exponential_decay_length_penalty: + value: null +finetuning_task: + value: null +forced_bos_token_id: + value: null +forced_eos_token_id: + value: null +fp16: + value: false +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +freeze_mm_mlp_adapter: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +group_by_modality_length: + value: false +half_precision_backend: + value: auto +hidden_act: + value: silu +hidden_size: + value: 4096 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +image_aspect_ratio: + value: square +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +initializer_range: + value: 0.02 +intermediate_size: + value: 14336 +is_decoder: + value: false +is_encoder_decoder: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +learning_rate: + value: 0.0005 +length_column_name: + value: length +length_penalty: + value: 1 +load_best_model_at_end: + value: false +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain/runs/Jan13_22-46-57_h-liuchonghan-rler1225-a800-a100-2nodes-m-0 +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr_scheduler_type: + value: cosine +max_grad_norm: + value: 1 +max_length: + value: 20 +max_position_embeddings: + value: 8192 +max_steps: + value: -1 +metric_for_best_model: + value: null +min_length: + value: 0 +mm_hidden_size: + value: 1280 +mm_projector_lr: + value: null +mm_projector_type: + value: mlp2x_gelu +model/num_parameters: + value: 8052289536 +model_max_length: + value: 2048 +model_type: + value: bunny-llama +mp_parameters: + value: "" +mpt_attn_impl: + value: triton +neftune_noise_alpha: + value: null +no_cuda: + value: false +no_repeat_ngram_size: + value: 0 +num_attention_heads: + value: 32 +num_beam_groups: + value: 1 +num_beams: + value: 1 +num_hidden_layers: + value: 32 +num_key_value_heads: + value: 8 +num_return_sequences: + value: 1 +num_train_epochs: + value: 1 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain +output_hidden_states: + value: false +output_scores: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: null +past_index: + value: -1 +per_device_eval_batch_size: + value: 4 +per_device_train_batch_size: + value: 8 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +prefix: + value: null +pretraining_tp: + value: 1 +problem_type: + value: null +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_invalid_values: + value: false +remove_unused_columns: + value: false +repetition_penalty: + value: 1 +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +return_dict_in_generate: + value: false +rms_norm_eps: + value: 1e-05 +rope_scaling: + value: null +rope_theta: + value: 500000 +run_name: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: true +save_steps: + value: 100000 +save_strategy: + value: steps +save_total_limit: + value: 1 +seed: + value: 42 +sep_token_id: + value: null +skip_memory_metrics: + value: true +split_batches: + value: null +suppress_tokens: + value: null +task_specific_params: + value: null +temperature: + value: 1 +tf_legacy_loss: + value: false +tf32: + value: true +tie_encoder_decoder: + value: false +tie_word_embeddings: + value: false +tokenizer_class: + value: null +tokenizer_model_max_length: + value: 2048 +tokenizer_padding_side: + value: right +top_k: + value: 50 +top_p: + value: 1 +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_dtype: + value: bfloat16 +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +torchscript: + value: false +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +transformers_version: + value: 4.44.0 +tune_mm_mlp_adapter: + value: true +typical_p: + value: 1 +use_bfloat16: + value: false +use_cache: + value: false +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_mm_proj: + value: true +use_mps_device: + value: false +use_s2: + value: false +vocab_size: + value: 128256 +warmup_ratio: + value: 0.03 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/files/output.log b/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..73bfb6a24017f4e7201690642cd0219b04636814 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/files/output.log @@ -0,0 +1,3910 @@ + 0%| | 0/3906 [00:00\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-14T00:15:32.056821504+08:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/idx5bv8k/file_stream","body":"{\"error\":\"context deadline exceeded\"}"} +{"time":"2026-01-14T00:16:40.030436551+08:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/idx5bv8k/file_stream","body":"{\"error\":\"context deadline exceeded\"}"} +{"time":"2026-01-14T01:33:46.502424149+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/idx5bv8k/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-14T01:36:18.839797789+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/idx5bv8k/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-14T01:50:02.144675052+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/idx5bv8k/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-14T01:55:17.544745481+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/idx5bv8k/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-14T04:40:17.278528552+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/idx5bv8k/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-14T07:58:25.26478688+08:00","level":"INFO","msg":"stream: closing","id":"idx5bv8k"} +{"time":"2026-01-14T07:58:27.632916706+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-01-14T07:58:28.194288215+08:00","level":"INFO","msg":"handler: closed","stream_id":"idx5bv8k"} +{"time":"2026-01-14T07:58:28.194425883+08:00","level":"INFO","msg":"sender: closed","stream_id":"idx5bv8k"} +{"time":"2026-01-14T07:58:28.194453752+08:00","level":"INFO","msg":"stream: closed","id":"idx5bv8k"} diff --git a/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/logs/debug.log b/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0bd4e7c7244ec8e83214ba3fdca81e7ea428de45 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/logs/debug.log @@ -0,0 +1,26 @@ +2026-01-13 22:49:42,362 INFO MainThread:206956 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2026-01-13 22:49:42,362 INFO MainThread:206956 [wandb_setup.py:_flush():80] Configure stats pid to 206956 +2026-01-13 22:49:42,362 INFO MainThread:206956 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2026-01-13 22:49:42,362 INFO MainThread:206956 [wandb_setup.py:_flush():80] Loading settings from /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/settings +2026-01-13 22:49:42,362 INFO MainThread:206956 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-01-13 22:49:42,362 INFO MainThread:206956 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/logs/debug.log +2026-01-13 22:49:42,363 INFO MainThread:206956 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/logs/debug-internal.log +2026-01-13 22:49:42,363 INFO MainThread:206956 [wandb_init.py:init():841] calling init triggers +2026-01-13 22:49:42,363 INFO MainThread:206956 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-01-13 22:49:42,363 INFO MainThread:206956 [wandb_init.py:init():889] starting backend +2026-01-13 22:49:43,476 INFO MainThread:206956 [wandb_init.py:init():892] sending inform_init request +2026-01-13 22:49:43,504 INFO MainThread:206956 [wandb_init.py:init():900] backend started and connected +2026-01-13 22:49:43,505 INFO MainThread:206956 [wandb_init.py:init():970] updated telemetry +2026-01-13 22:49:43,507 INFO MainThread:206956 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2026-01-13 22:49:51,043 INFO MainThread:206956 [wandb_init.py:init():1041] starting run threads in backend +2026-01-13 22:49:51,498 INFO MainThread:206956 [wandb_run.py:_console_start():2521] atexit reg +2026-01-13 22:49:51,498 INFO MainThread:206956 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2026-01-13 22:49:51,498 INFO MainThread:206956 [wandb_run.py:_redirect():2438] Wrapping output streams. +2026-01-13 22:49:51,498 INFO MainThread:206956 [wandb_run.py:_redirect():2461] Redirects installed. +2026-01-13 22:49:51,509 INFO MainThread:206956 [wandb_init.py:init():1081] run started, returning control to user process +2026-01-13 22:49:51,512 INFO MainThread:206956 [wandb_run.py:_config_callback():1396] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 8192, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 500000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct', 'transformers_version': '4.44.0', 'model_type': 'bunny-llama', 'use_mm_proj': True, 'mm_projector_type': 'mlp2x_gelu', 'mm_hidden_size': 1280, 'image_aspect_ratio': 'square', 'tokenizer_padding_side': 'right', 'tokenizer_model_max_length': 2048, 'tune_mm_mlp_adapter': True, 'freeze_mm_mlp_adapter': False, 'mm_projector_lr': None, 'use_s2': False, 'output_dir': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0005, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain/runs/Jan13_22-46-57_h-liuchonghan-rler1225-a800-a100-2nodes-m-0', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100000, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': True, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': './script/deepspeed/zero2.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'no', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False, 'cache_dir': None, 'mpt_attn_impl': 'triton', 'model_max_length': 2048, 'group_by_modality_length': False} +2026-01-13 22:49:51,531 INFO MainThread:206956 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8052289536 - > +2026-01-13 22:49:51,531 INFO MainThread:206956 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 8052289536 None +2026-01-14 07:58:25,265 INFO wandb-AsyncioManager-main:206956 [service_client.py:_forward_responses():80] Reached EOF. +2026-01-14 07:58:25,265 INFO wandb-AsyncioManager-main:206956 [mailbox.py:close():137] Closing mailbox, abandoning 2 handles. diff --git a/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/run-idx5bv8k.wandb b/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/run-idx5bv8k.wandb new file mode 100644 index 0000000000000000000000000000000000000000..84bb96104438939dbc2b3ee8904175822e143ced --- /dev/null +++ b/Unicorn_dense/wandb/run-20260113_224942-idx5bv8k/run-idx5bv8k.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d4df5e02673a460677e236b12aaa85d1a1b7b78eb9320dbd8f2faac33194c2b +size 16957751 diff --git a/Unicorn_dense/wandb/run-20260114_135602-iex36np1/files/output.log b/Unicorn_dense/wandb/run-20260114_135602-iex36np1/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..bf01e6b9cac1abaa43a8f448ab389d1cc93da461 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260114_135602-iex36np1/files/output.log @@ -0,0 +1,1376 @@ + 0%| | 0/3906 [00:00', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'no', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False, 'cache_dir': None, 'mpt_attn_impl': 'triton', 'model_max_length': 2048, 'group_by_modality_length': False} +2026-01-14 13:56:04,894 INFO MainThread:280435 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8052289536 - > +2026-01-14 13:56:04,894 INFO MainThread:280435 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 8052289536 None diff --git a/Unicorn_dense/wandb/run-20260114_135602-iex36np1/run-iex36np1.wandb b/Unicorn_dense/wandb/run-20260114_135602-iex36np1/run-iex36np1.wandb new file mode 100644 index 0000000000000000000000000000000000000000..0676f32d0c991c7f3a38649504c7a2ea6481f5b9 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260114_135602-iex36np1/run-iex36np1.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f72d9cb9471bd6c6acf00d28323cb337004335913889579f052c8b0000aea438 +size 5570560 diff --git a/Unicorn_dense/wandb/run-20260114_170827-as22o934/files/config.yaml b/Unicorn_dense/wandb/run-20260114_170827-as22o934/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee81741c1787ac7f0fa56eee92ddf653830ea140 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260114_170827-as22o934/files/config.yaml @@ -0,0 +1,617 @@ +_name_or_path: + value: /llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct +_wandb: + value: + cli_version: 0.23.1 + e: + tbevn34tjls6va6joh8frfsgpapnqymx: + args: + - --local_rank=0 + - --deepspeed + - ./script/deepspeed/zero2.json + - --model_name_or_path + - /llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct + - --model_type + - llama3-8b + - --version + - plain + - --data_path + - /llm-align/liuchonghan/xiaomin/data/densefusion/densefusion_pretrain_ours.json + - --mm_projector_type + - mlp2x_gelu + - --tune_mm_mlp_adapter + - "True" + - --image_aspect_ratio + - square + - --bf16 + - "True" + - --output_dir + - /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain + - --num_train_epochs + - "1" + - --per_device_train_batch_size + - "8" + - --per_device_eval_batch_size + - "4" + - --gradient_accumulation_steps + - "4" + - --evaluation_strategy + - "no" + - --save_strategy + - steps + - --save_steps + - "100000" + - --save_total_limit + - "1" + - --learning_rate + - "5e-4" + - --weight_decay + - "0." + - --warmup_ratio + - "0.03" + - --lr_scheduler_type + - cosine + - --logging_steps + - "1" + - --tf32 + - "True" + - --model_max_length + - "2048" + - --gradient_checkpointing + - "True" + - --dataloader_num_workers + - "4" + - --lazy_preprocess + - "True" + - --report_to + - wandb + codePath: bunny/train/train.py + codePathLocal: bunny/train/train.py + cpu_count: 128 + cpu_count_logical: 255 + cudaVersion: "12.4" + disk: + /: + total: "53687091200" + used: "15148818432" + email: yuxm02@gmail.com + executable: /llm-align/liuchonghan/env/envs/bunny/bin/python + gpu: NVIDIA A800-SXM4-80GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-a3fc74bf-1f17-5efa-f250-bcbcbcf95ece + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-de894aa2-80ed-5287-811c-51192fa8348e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-3c6bdd46-409d-a93f-e9b7-a124938ce17e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-6e5234d7-e2d4-69d9-3213-cc1239573e6d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-0e81592d-2c75-2476-c37d-e8d1d4fb2d65 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-d2f05526-805c-8dec-ae5f-ff04f3cec8f5 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-b3f69678-17b2-d1eb-c163-f104c5f03d77 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-9bf33294-cd7e-ffd9-184d-02e4b5d18550 + host: h-liuchonghan-rler1225-a800-a100-2nodes-m-0 + memory: + total: "2164358094848" + os: Linux-5.4.0-42-generic-x86_64-with-glibc2.35 + program: /llm-align/liuchonghan/xiaomin/Unicorn_dense/bunny/train/train.py + python: CPython 3.10.19 + root: /llm-align/liuchonghan/xiaomin/Unicorn_dense + startedAt: "2026-01-14T09:08:27.516123Z" + writerId: tbevn34tjls6va6joh8frfsgpapnqymx + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.10.19 + t: + "1": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 63 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 63 + - 71 + - 98 + "3": + - 7 + - 13 + - 19 + - 62 + - 66 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.44.0 + "9": + "1": transformers_trainer + "12": 0.23.1 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +add_cross_attention: + value: false +architectures: + value: + - LlamaForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +bad_words_ids: + value: null +batch_eval_metrics: + value: false +begin_suppress_tokens: + value: null +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: 128000 +cache_dir: + value: null +chunk_size_feed_forward: + value: 0 +cross_attention_hidden_size: + value: null +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 4 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +decoder_start_token_id: + value: null +deepspeed: + value: ./script/deepspeed/zero2.json +disable_tqdm: + value: false +dispatch_batches: + value: null +diversity_penalty: + value: 0 +do_eval: + value: false +do_predict: + value: false +do_sample: + value: false +do_train: + value: false +early_stopping: + value: false +encoder_no_repeat_ngram_size: + value: 0 +eos_token_id: + value: 128001 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +evaluation_strategy: + value: "no" +exponential_decay_length_penalty: + value: null +finetuning_task: + value: null +forced_bos_token_id: + value: null +forced_eos_token_id: + value: null +fp16: + value: false +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +freeze_mm_mlp_adapter: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +group_by_modality_length: + value: false +half_precision_backend: + value: auto +hidden_act: + value: silu +hidden_size: + value: 4096 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +image_aspect_ratio: + value: square +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +initializer_range: + value: 0.02 +intermediate_size: + value: 14336 +is_decoder: + value: false +is_encoder_decoder: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +learning_rate: + value: 0.0005 +length_column_name: + value: length +length_penalty: + value: 1 +load_best_model_at_end: + value: false +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain/runs/Jan14_17-05-42_h-liuchonghan-rler1225-a800-a100-2nodes-m-0 +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr_scheduler_type: + value: cosine +max_grad_norm: + value: 1 +max_length: + value: 20 +max_position_embeddings: + value: 8192 +max_steps: + value: -1 +metric_for_best_model: + value: null +min_length: + value: 0 +mm_hidden_size: + value: 1280 +mm_projector_lr: + value: null +mm_projector_type: + value: mlp2x_gelu +model/num_parameters: + value: 8052289536 +model_max_length: + value: 2048 +model_type: + value: bunny-llama +mp_parameters: + value: "" +mpt_attn_impl: + value: triton +neftune_noise_alpha: + value: null +no_cuda: + value: false +no_repeat_ngram_size: + value: 0 +num_attention_heads: + value: 32 +num_beam_groups: + value: 1 +num_beams: + value: 1 +num_hidden_layers: + value: 32 +num_key_value_heads: + value: 8 +num_return_sequences: + value: 1 +num_train_epochs: + value: 1 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain +output_hidden_states: + value: false +output_scores: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: null +past_index: + value: -1 +per_device_eval_batch_size: + value: 4 +per_device_train_batch_size: + value: 8 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +prefix: + value: null +pretraining_tp: + value: 1 +problem_type: + value: null +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_invalid_values: + value: false +remove_unused_columns: + value: false +repetition_penalty: + value: 1 +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +return_dict_in_generate: + value: false +rms_norm_eps: + value: 1e-05 +rope_scaling: + value: null +rope_theta: + value: 500000 +run_name: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: true +save_steps: + value: 100000 +save_strategy: + value: steps +save_total_limit: + value: 1 +seed: + value: 42 +sep_token_id: + value: null +skip_memory_metrics: + value: true +split_batches: + value: null +suppress_tokens: + value: null +task_specific_params: + value: null +temperature: + value: 1 +tf_legacy_loss: + value: false +tf32: + value: true +tie_encoder_decoder: + value: false +tie_word_embeddings: + value: false +tokenizer_class: + value: null +tokenizer_model_max_length: + value: 2048 +tokenizer_padding_side: + value: right +top_k: + value: 50 +top_p: + value: 1 +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_dtype: + value: bfloat16 +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +torchscript: + value: false +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +transformers_version: + value: 4.44.0 +tune_mm_mlp_adapter: + value: true +typical_p: + value: 1 +use_bfloat16: + value: false +use_cache: + value: false +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_mm_proj: + value: true +use_mps_device: + value: false +use_s2: + value: false +vocab_size: + value: 128256 +warmup_ratio: + value: 0.03 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/Unicorn_dense/wandb/run-20260114_170827-as22o934/files/output.log b/Unicorn_dense/wandb/run-20260114_170827-as22o934/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..71eb88bba8993d9f83f15c2cb0460d7390502676 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260114_170827-as22o934/files/output.log @@ -0,0 +1,3910 @@ + 0%| | 0/3906 [00:00\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-14T23:29:54.76461508+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/as22o934/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-14T23:38:28.980358956+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/as22o934/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-14T23:38:51.166330275+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/as22o934/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-14T23:45:34.41006927+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/as22o934/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-15T01:46:46.659008114+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/as22o934/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-15T02:16:55.6489901+08:00","level":"INFO","msg":"stream: closing","id":"as22o934"} +{"time":"2026-01-15T02:16:58.039306545+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-01-15T02:16:58.637401859+08:00","level":"INFO","msg":"handler: closed","stream_id":"as22o934"} +{"time":"2026-01-15T02:16:58.637553916+08:00","level":"INFO","msg":"sender: closed","stream_id":"as22o934"} +{"time":"2026-01-15T02:16:58.637584719+08:00","level":"INFO","msg":"stream: closed","id":"as22o934"} diff --git a/Unicorn_dense/wandb/run-20260114_170827-as22o934/logs/debug.log b/Unicorn_dense/wandb/run-20260114_170827-as22o934/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..a165cd4acc65deb96d666b6fa9a04d6e796a8502 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260114_170827-as22o934/logs/debug.log @@ -0,0 +1,26 @@ +2026-01-14 17:08:27,523 INFO MainThread:305207 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2026-01-14 17:08:27,523 INFO MainThread:305207 [wandb_setup.py:_flush():80] Configure stats pid to 305207 +2026-01-14 17:08:27,523 INFO MainThread:305207 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2026-01-14 17:08:27,523 INFO MainThread:305207 [wandb_setup.py:_flush():80] Loading settings from /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/settings +2026-01-14 17:08:27,523 INFO MainThread:305207 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-01-14 17:08:27,524 INFO MainThread:305207 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/run-20260114_170827-as22o934/logs/debug.log +2026-01-14 17:08:27,524 INFO MainThread:305207 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/run-20260114_170827-as22o934/logs/debug-internal.log +2026-01-14 17:08:27,524 INFO MainThread:305207 [wandb_init.py:init():841] calling init triggers +2026-01-14 17:08:27,524 INFO MainThread:305207 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-01-14 17:08:27,524 INFO MainThread:305207 [wandb_init.py:init():889] starting backend +2026-01-14 17:08:28,602 INFO MainThread:305207 [wandb_init.py:init():892] sending inform_init request +2026-01-14 17:08:28,617 INFO MainThread:305207 [wandb_init.py:init():900] backend started and connected +2026-01-14 17:08:28,619 INFO MainThread:305207 [wandb_init.py:init():970] updated telemetry +2026-01-14 17:08:28,620 INFO MainThread:305207 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2026-01-14 17:08:29,735 INFO MainThread:305207 [wandb_init.py:init():1041] starting run threads in backend +2026-01-14 17:08:30,125 INFO MainThread:305207 [wandb_run.py:_console_start():2521] atexit reg +2026-01-14 17:08:30,125 INFO MainThread:305207 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2026-01-14 17:08:30,126 INFO MainThread:305207 [wandb_run.py:_redirect():2438] Wrapping output streams. +2026-01-14 17:08:30,126 INFO MainThread:305207 [wandb_run.py:_redirect():2461] Redirects installed. +2026-01-14 17:08:30,147 INFO MainThread:305207 [wandb_init.py:init():1081] run started, returning control to user process +2026-01-14 17:08:30,150 INFO MainThread:305207 [wandb_run.py:_config_callback():1396] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 8192, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 500000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct', 'transformers_version': '4.44.0', 'model_type': 'bunny-llama', 'use_mm_proj': True, 'mm_projector_type': 'mlp2x_gelu', 'mm_hidden_size': 1280, 'image_aspect_ratio': 'square', 'tokenizer_padding_side': 'right', 'tokenizer_model_max_length': 2048, 'tune_mm_mlp_adapter': True, 'freeze_mm_mlp_adapter': False, 'mm_projector_lr': None, 'use_s2': False, 'output_dir': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0005, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain/runs/Jan14_17-05-42_h-liuchonghan-rler1225-a800-a100-2nodes-m-0', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100000, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': True, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': './script/deepspeed/zero2.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'no', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False, 'cache_dir': None, 'mpt_attn_impl': 'triton', 'model_max_length': 2048, 'group_by_modality_length': False} +2026-01-14 17:08:30,167 INFO MainThread:305207 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8052289536 - > +2026-01-14 17:08:30,167 INFO MainThread:305207 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 8052289536 None +2026-01-15 02:16:55,649 INFO wandb-AsyncioManager-main:305207 [service_client.py:_forward_responses():80] Reached EOF. +2026-01-15 02:16:55,649 INFO wandb-AsyncioManager-main:305207 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Unicorn_dense/wandb/run-20260114_170827-as22o934/run-as22o934.wandb b/Unicorn_dense/wandb/run-20260114_170827-as22o934/run-as22o934.wandb new file mode 100644 index 0000000000000000000000000000000000000000..f02774c6d46da0f218b3b6786f2573fa518a9b4d --- /dev/null +++ b/Unicorn_dense/wandb/run-20260114_170827-as22o934/run-as22o934.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24086cc3f2e74cdec95e40c37750007ce9938d8a4a48b8decb7db6f1ff42039d +size 16979219 diff --git a/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/files/config.yaml b/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48c33206460711e2ee9fe64294495a6795886988 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/files/config.yaml @@ -0,0 +1,617 @@ +_name_or_path: + value: /llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct +_wandb: + value: + cli_version: 0.23.1 + e: + zthw7aswy8gzcyfswe99b65q6ei75utt: + args: + - --local_rank=0 + - --deepspeed + - ./script/deepspeed/zero2.json + - --model_name_or_path + - /llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct + - --model_type + - llama3-8b + - --version + - plain + - --data_path + - /llm-align/liuchonghan/xiaomin/data/densefusion/densefusion_pretrain_ours.json + - --mm_projector_type + - mlp2x_gelu + - --tune_mm_mlp_adapter + - "True" + - --image_aspect_ratio + - square + - --bf16 + - "True" + - --output_dir + - /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain + - --num_train_epochs + - "1" + - --per_device_train_batch_size + - "8" + - --per_device_eval_batch_size + - "4" + - --gradient_accumulation_steps + - "4" + - --evaluation_strategy + - "no" + - --save_strategy + - steps + - --save_steps + - "100000" + - --save_total_limit + - "1" + - --learning_rate + - "5e-4" + - --weight_decay + - "0." + - --warmup_ratio + - "0.03" + - --lr_scheduler_type + - cosine + - --logging_steps + - "1" + - --tf32 + - "True" + - --model_max_length + - "2048" + - --gradient_checkpointing + - "True" + - --dataloader_num_workers + - "4" + - --lazy_preprocess + - "True" + - --report_to + - wandb + codePath: bunny/train/train.py + codePathLocal: bunny/train/train.py + cpu_count: 128 + cpu_count_logical: 255 + cudaVersion: "12.4" + disk: + /: + total: "53687091200" + used: "15148822528" + email: yuxm02@gmail.com + executable: /llm-align/liuchonghan/env/envs/bunny/bin/python + gpu: NVIDIA A800-SXM4-80GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-a3fc74bf-1f17-5efa-f250-bcbcbcf95ece + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-de894aa2-80ed-5287-811c-51192fa8348e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-3c6bdd46-409d-a93f-e9b7-a124938ce17e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-6e5234d7-e2d4-69d9-3213-cc1239573e6d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-0e81592d-2c75-2476-c37d-e8d1d4fb2d65 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-d2f05526-805c-8dec-ae5f-ff04f3cec8f5 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-b3f69678-17b2-d1eb-c163-f104c5f03d77 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-9bf33294-cd7e-ffd9-184d-02e4b5d18550 + host: h-liuchonghan-rler1225-a800-a100-2nodes-m-0 + memory: + total: "2164358094848" + os: Linux-5.4.0-42-generic-x86_64-with-glibc2.35 + program: /llm-align/liuchonghan/xiaomin/Unicorn_dense/bunny/train/train.py + python: CPython 3.10.19 + root: /llm-align/liuchonghan/xiaomin/Unicorn_dense + startedAt: "2026-01-15T02:35:08.598950Z" + writerId: zthw7aswy8gzcyfswe99b65q6ei75utt + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.10.19 + t: + "1": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 63 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 63 + - 71 + - 98 + "3": + - 7 + - 13 + - 19 + - 62 + - 66 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.44.0 + "9": + "1": transformers_trainer + "12": 0.23.1 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +add_cross_attention: + value: false +architectures: + value: + - LlamaForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +bad_words_ids: + value: null +batch_eval_metrics: + value: false +begin_suppress_tokens: + value: null +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: 128000 +cache_dir: + value: null +chunk_size_feed_forward: + value: 0 +cross_attention_hidden_size: + value: null +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 4 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +decoder_start_token_id: + value: null +deepspeed: + value: ./script/deepspeed/zero2.json +disable_tqdm: + value: false +dispatch_batches: + value: null +diversity_penalty: + value: 0 +do_eval: + value: false +do_predict: + value: false +do_sample: + value: false +do_train: + value: false +early_stopping: + value: false +encoder_no_repeat_ngram_size: + value: 0 +eos_token_id: + value: 128001 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +evaluation_strategy: + value: "no" +exponential_decay_length_penalty: + value: null +finetuning_task: + value: null +forced_bos_token_id: + value: null +forced_eos_token_id: + value: null +fp16: + value: false +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +freeze_mm_mlp_adapter: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +group_by_modality_length: + value: false +half_precision_backend: + value: auto +hidden_act: + value: silu +hidden_size: + value: 4096 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +image_aspect_ratio: + value: square +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +initializer_range: + value: 0.02 +intermediate_size: + value: 14336 +is_decoder: + value: false +is_encoder_decoder: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +learning_rate: + value: 0.0005 +length_column_name: + value: length +length_penalty: + value: 1 +load_best_model_at_end: + value: false +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain/runs/Jan15_10-32-31_h-liuchonghan-rler1225-a800-a100-2nodes-m-0 +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr_scheduler_type: + value: cosine +max_grad_norm: + value: 1 +max_length: + value: 20 +max_position_embeddings: + value: 8192 +max_steps: + value: -1 +metric_for_best_model: + value: null +min_length: + value: 0 +mm_hidden_size: + value: 1280 +mm_projector_lr: + value: null +mm_projector_type: + value: mlp2x_gelu +model/num_parameters: + value: 8052289536 +model_max_length: + value: 2048 +model_type: + value: bunny-llama +mp_parameters: + value: "" +mpt_attn_impl: + value: triton +neftune_noise_alpha: + value: null +no_cuda: + value: false +no_repeat_ngram_size: + value: 0 +num_attention_heads: + value: 32 +num_beam_groups: + value: 1 +num_beams: + value: 1 +num_hidden_layers: + value: 32 +num_key_value_heads: + value: 8 +num_return_sequences: + value: 1 +num_train_epochs: + value: 1 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain +output_hidden_states: + value: false +output_scores: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: null +past_index: + value: -1 +per_device_eval_batch_size: + value: 4 +per_device_train_batch_size: + value: 8 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +prefix: + value: null +pretraining_tp: + value: 1 +problem_type: + value: null +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_invalid_values: + value: false +remove_unused_columns: + value: false +repetition_penalty: + value: 1 +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +return_dict_in_generate: + value: false +rms_norm_eps: + value: 1e-05 +rope_scaling: + value: null +rope_theta: + value: 500000 +run_name: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: true +save_steps: + value: 100000 +save_strategy: + value: steps +save_total_limit: + value: 1 +seed: + value: 42 +sep_token_id: + value: null +skip_memory_metrics: + value: true +split_batches: + value: null +suppress_tokens: + value: null +task_specific_params: + value: null +temperature: + value: 1 +tf_legacy_loss: + value: false +tf32: + value: true +tie_encoder_decoder: + value: false +tie_word_embeddings: + value: false +tokenizer_class: + value: null +tokenizer_model_max_length: + value: 2048 +tokenizer_padding_side: + value: right +top_k: + value: 50 +top_p: + value: 1 +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_dtype: + value: bfloat16 +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +torchscript: + value: false +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +transformers_version: + value: 4.44.0 +tune_mm_mlp_adapter: + value: true +typical_p: + value: 1 +use_bfloat16: + value: false +use_cache: + value: false +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_mm_proj: + value: true +use_mps_device: + value: false +use_s2: + value: false +vocab_size: + value: 128256 +warmup_ratio: + value: 0.03 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/files/output.log b/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f09128fc534fa2c6e785e6282e7f7c1817909d4b --- /dev/null +++ b/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/files/output.log @@ -0,0 +1,3910 @@ + 0%| | 0/3906 [00:00\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-15T12:45:26.66958665+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/pagdc54a/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-15T19:07:27.724441083+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/pagdc54a/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-15T19:43:37.951517589+08:00","level":"INFO","msg":"stream: closing","id":"pagdc54a"} +{"time":"2026-01-15T19:43:40.395252685+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-01-15T19:43:41.028823355+08:00","level":"INFO","msg":"handler: closed","stream_id":"pagdc54a"} +{"time":"2026-01-15T19:43:41.028955435+08:00","level":"INFO","msg":"sender: closed","stream_id":"pagdc54a"} +{"time":"2026-01-15T19:43:41.028971639+08:00","level":"INFO","msg":"stream: closed","id":"pagdc54a"} diff --git a/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/logs/debug.log b/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..998cb7145c8e8ce001152b53ff956e19aec1d522 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/logs/debug.log @@ -0,0 +1,26 @@ +2026-01-15 10:35:08,607 INFO MainThread:383416 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2026-01-15 10:35:08,607 INFO MainThread:383416 [wandb_setup.py:_flush():80] Configure stats pid to 383416 +2026-01-15 10:35:08,607 INFO MainThread:383416 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2026-01-15 10:35:08,607 INFO MainThread:383416 [wandb_setup.py:_flush():80] Loading settings from /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/settings +2026-01-15 10:35:08,607 INFO MainThread:383416 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-01-15 10:35:08,607 INFO MainThread:383416 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/logs/debug.log +2026-01-15 10:35:08,607 INFO MainThread:383416 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/logs/debug-internal.log +2026-01-15 10:35:08,607 INFO MainThread:383416 [wandb_init.py:init():841] calling init triggers +2026-01-15 10:35:08,607 INFO MainThread:383416 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-01-15 10:35:08,607 INFO MainThread:383416 [wandb_init.py:init():889] starting backend +2026-01-15 10:35:09,004 INFO MainThread:383416 [wandb_init.py:init():892] sending inform_init request +2026-01-15 10:35:09,016 INFO MainThread:383416 [wandb_init.py:init():900] backend started and connected +2026-01-15 10:35:09,019 INFO MainThread:383416 [wandb_init.py:init():970] updated telemetry +2026-01-15 10:35:09,021 INFO MainThread:383416 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2026-01-15 10:35:10,208 INFO MainThread:383416 [wandb_init.py:init():1041] starting run threads in backend +2026-01-15 10:35:10,455 INFO MainThread:383416 [wandb_run.py:_console_start():2521] atexit reg +2026-01-15 10:35:10,455 INFO MainThread:383416 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2026-01-15 10:35:10,456 INFO MainThread:383416 [wandb_run.py:_redirect():2438] Wrapping output streams. +2026-01-15 10:35:10,456 INFO MainThread:383416 [wandb_run.py:_redirect():2461] Redirects installed. +2026-01-15 10:35:10,464 INFO MainThread:383416 [wandb_init.py:init():1081] run started, returning control to user process +2026-01-15 10:35:10,468 INFO MainThread:383416 [wandb_run.py:_config_callback():1396] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 8192, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 500000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct', 'transformers_version': '4.44.0', 'model_type': 'bunny-llama', 'use_mm_proj': True, 'mm_projector_type': 'mlp2x_gelu', 'mm_hidden_size': 1280, 'image_aspect_ratio': 'square', 'tokenizer_padding_side': 'right', 'tokenizer_model_max_length': 2048, 'tune_mm_mlp_adapter': True, 'freeze_mm_mlp_adapter': False, 'mm_projector_lr': None, 'use_s2': False, 'output_dir': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0005, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain/runs/Jan15_10-32-31_h-liuchonghan-rler1225-a800-a100-2nodes-m-0', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100000, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': True, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': './script/deepspeed/zero2.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'no', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False, 'cache_dir': None, 'mpt_attn_impl': 'triton', 'model_max_length': 2048, 'group_by_modality_length': False} +2026-01-15 10:35:10,486 INFO MainThread:383416 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8052289536 - > +2026-01-15 10:35:10,486 INFO MainThread:383416 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 8052289536 None +2026-01-15 19:43:37,951 INFO wandb-AsyncioManager-main:383416 [service_client.py:_forward_responses():80] Reached EOF. +2026-01-15 19:43:37,951 INFO wandb-AsyncioManager-main:383416 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/run-pagdc54a.wandb b/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/run-pagdc54a.wandb new file mode 100644 index 0000000000000000000000000000000000000000..7649c9fd183281f18252f082bfe39f2c7a41d8dc --- /dev/null +++ b/Unicorn_dense/wandb/run-20260115_103508-pagdc54a/run-pagdc54a.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe5fa601fa8f9f59cf517e35cf15d77679d48ac79e8f44234150f86e1210fe9d +size 16989676 diff --git a/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/files/config.yaml b/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee92d04a7f0b9c273e83338dc8242d4816e17cd0 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/files/config.yaml @@ -0,0 +1,617 @@ +_name_or_path: + value: /llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct +_wandb: + value: + cli_version: 0.23.1 + e: + mzazmdqp175if22daw2xhicv4zdaa94j: + args: + - --local_rank=0 + - --deepspeed + - ./script/deepspeed/zero2.json + - --model_name_or_path + - /llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct + - --model_type + - llama3-8b + - --version + - plain + - --data_path + - /llm-align/liuchonghan/xiaomin/data/densefusion/densefusion_pretrain_ours.json + - --mm_projector_type + - mlp2x_gelu + - --tune_mm_mlp_adapter + - "True" + - --image_aspect_ratio + - square + - --bf16 + - "True" + - --output_dir + - /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain + - --num_train_epochs + - "1" + - --per_device_train_batch_size + - "8" + - --per_device_eval_batch_size + - "4" + - --gradient_accumulation_steps + - "4" + - --evaluation_strategy + - "no" + - --save_strategy + - steps + - --save_steps + - "100000" + - --save_total_limit + - "1" + - --learning_rate + - "5e-4" + - --weight_decay + - "0." + - --warmup_ratio + - "0.03" + - --lr_scheduler_type + - cosine + - --logging_steps + - "1" + - --tf32 + - "True" + - --model_max_length + - "2048" + - --gradient_checkpointing + - "True" + - --dataloader_num_workers + - "4" + - --lazy_preprocess + - "True" + - --report_to + - wandb + codePath: bunny/train/train.py + codePathLocal: bunny/train/train.py + cpu_count: 128 + cpu_count_logical: 255 + cudaVersion: "12.4" + disk: + /: + total: "53687091200" + used: "15148830720" + email: yuxm02@gmail.com + executable: /llm-align/liuchonghan/env/envs/bunny/bin/python + gpu: NVIDIA A800-SXM4-80GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-a3fc74bf-1f17-5efa-f250-bcbcbcf95ece + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-de894aa2-80ed-5287-811c-51192fa8348e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-3c6bdd46-409d-a93f-e9b7-a124938ce17e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-6e5234d7-e2d4-69d9-3213-cc1239573e6d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-0e81592d-2c75-2476-c37d-e8d1d4fb2d65 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-d2f05526-805c-8dec-ae5f-ff04f3cec8f5 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-b3f69678-17b2-d1eb-c163-f104c5f03d77 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A800-SXM4-80GB + uuid: GPU-9bf33294-cd7e-ffd9-184d-02e4b5d18550 + host: h-liuchonghan-rler1225-a800-a100-2nodes-m-0 + memory: + total: "2164358094848" + os: Linux-5.4.0-42-generic-x86_64-with-glibc2.35 + program: /llm-align/liuchonghan/xiaomin/Unicorn_dense/bunny/train/train.py + python: CPython 3.10.19 + root: /llm-align/liuchonghan/xiaomin/Unicorn_dense + startedAt: "2026-01-15T15:09:08.896016Z" + writerId: mzazmdqp175if22daw2xhicv4zdaa94j + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.10.19 + t: + "1": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 63 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 63 + - 71 + - 98 + "3": + - 7 + - 13 + - 19 + - 62 + - 66 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.44.0 + "9": + "1": transformers_trainer + "12": 0.23.1 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +add_cross_attention: + value: false +architectures: + value: + - LlamaForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +bad_words_ids: + value: null +batch_eval_metrics: + value: false +begin_suppress_tokens: + value: null +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: 128000 +cache_dir: + value: null +chunk_size_feed_forward: + value: 0 +cross_attention_hidden_size: + value: null +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 4 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +decoder_start_token_id: + value: null +deepspeed: + value: ./script/deepspeed/zero2.json +disable_tqdm: + value: false +dispatch_batches: + value: null +diversity_penalty: + value: 0 +do_eval: + value: false +do_predict: + value: false +do_sample: + value: false +do_train: + value: false +early_stopping: + value: false +encoder_no_repeat_ngram_size: + value: 0 +eos_token_id: + value: 128001 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +evaluation_strategy: + value: "no" +exponential_decay_length_penalty: + value: null +finetuning_task: + value: null +forced_bos_token_id: + value: null +forced_eos_token_id: + value: null +fp16: + value: false +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +freeze_mm_mlp_adapter: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +group_by_modality_length: + value: false +half_precision_backend: + value: auto +hidden_act: + value: silu +hidden_size: + value: 4096 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +image_aspect_ratio: + value: square +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +initializer_range: + value: 0.02 +intermediate_size: + value: 14336 +is_decoder: + value: false +is_encoder_decoder: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +learning_rate: + value: 0.0005 +length_column_name: + value: length +length_penalty: + value: 1 +load_best_model_at_end: + value: false +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain/runs/Jan15_23-06-00_h-liuchonghan-rler1225-a800-a100-2nodes-m-0 +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr_scheduler_type: + value: cosine +max_grad_norm: + value: 1 +max_length: + value: 20 +max_position_embeddings: + value: 8192 +max_steps: + value: -1 +metric_for_best_model: + value: null +min_length: + value: 0 +mm_hidden_size: + value: 1280 +mm_projector_lr: + value: null +mm_projector_type: + value: mlp2x_gelu +model/num_parameters: + value: 8052289536 +model_max_length: + value: 2048 +model_type: + value: bunny-llama +mp_parameters: + value: "" +mpt_attn_impl: + value: triton +neftune_noise_alpha: + value: null +no_cuda: + value: false +no_repeat_ngram_size: + value: 0 +num_attention_heads: + value: 32 +num_beam_groups: + value: 1 +num_beams: + value: 1 +num_hidden_layers: + value: 32 +num_key_value_heads: + value: 8 +num_return_sequences: + value: 1 +num_train_epochs: + value: 1 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain +output_hidden_states: + value: false +output_scores: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: null +past_index: + value: -1 +per_device_eval_batch_size: + value: 4 +per_device_train_batch_size: + value: 8 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +prefix: + value: null +pretraining_tp: + value: 1 +problem_type: + value: null +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_invalid_values: + value: false +remove_unused_columns: + value: false +repetition_penalty: + value: 1 +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +return_dict_in_generate: + value: false +rms_norm_eps: + value: 1e-05 +rope_scaling: + value: null +rope_theta: + value: 500000 +run_name: + value: /llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: true +save_steps: + value: 100000 +save_strategy: + value: steps +save_total_limit: + value: 1 +seed: + value: 42 +sep_token_id: + value: null +skip_memory_metrics: + value: true +split_batches: + value: null +suppress_tokens: + value: null +task_specific_params: + value: null +temperature: + value: 1 +tf_legacy_loss: + value: false +tf32: + value: true +tie_encoder_decoder: + value: false +tie_word_embeddings: + value: false +tokenizer_class: + value: null +tokenizer_model_max_length: + value: 2048 +tokenizer_padding_side: + value: right +top_k: + value: 50 +top_p: + value: 1 +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_dtype: + value: bfloat16 +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +torchscript: + value: false +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +transformers_version: + value: 4.44.0 +tune_mm_mlp_adapter: + value: true +typical_p: + value: 1 +use_bfloat16: + value: false +use_cache: + value: false +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_mm_proj: + value: true +use_mps_device: + value: false +use_s2: + value: false +vocab_size: + value: 128256 +warmup_ratio: + value: 0.03 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/files/output.log b/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..22854c339b2973157c264c8c491cebf66b9f136c --- /dev/null +++ b/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/files/output.log @@ -0,0 +1,3910 @@ + 0%| | 0/3906 [00:00\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-16T01:01:02.736252069+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/tzltcduc/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-16T01:50:41.805241062+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/tzltcduc/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-16T02:31:11.489117871+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/tzltcduc/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-16T05:47:50.424144962+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/xiaomin02/huggingface/tzltcduc/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-01-16T08:17:31.77758427+08:00","level":"INFO","msg":"stream: closing","id":"tzltcduc"} +{"time":"2026-01-16T08:17:34.163717521+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-01-16T08:17:34.848919185+08:00","level":"INFO","msg":"handler: closed","stream_id":"tzltcduc"} +{"time":"2026-01-16T08:17:34.849133266+08:00","level":"INFO","msg":"sender: closed","stream_id":"tzltcduc"} +{"time":"2026-01-16T08:17:34.84916337+08:00","level":"INFO","msg":"stream: closed","id":"tzltcduc"} diff --git a/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/logs/debug.log b/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..b08e57f53a0cd2ba6a32dccb3c226ce02bfb41e3 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/logs/debug.log @@ -0,0 +1,26 @@ +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_setup.py:_flush():80] Configure stats pid to 443684 +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_setup.py:_flush():80] Loading settings from /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/settings +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-01-15 23:09:08,904 INFO MainThread:443684 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/logs/debug.log +2026-01-15 23:09:08,905 INFO MainThread:443684 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /llm-align/liuchonghan/xiaomin/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/logs/debug-internal.log +2026-01-15 23:09:08,905 INFO MainThread:443684 [wandb_init.py:init():841] calling init triggers +2026-01-15 23:09:08,905 INFO MainThread:443684 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-01-15 23:09:08,905 INFO MainThread:443684 [wandb_init.py:init():889] starting backend +2026-01-15 23:09:09,343 INFO MainThread:443684 [wandb_init.py:init():892] sending inform_init request +2026-01-15 23:09:09,356 INFO MainThread:443684 [wandb_init.py:init():900] backend started and connected +2026-01-15 23:09:09,358 INFO MainThread:443684 [wandb_init.py:init():970] updated telemetry +2026-01-15 23:09:09,359 INFO MainThread:443684 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2026-01-15 23:09:10,667 INFO MainThread:443684 [wandb_init.py:init():1041] starting run threads in backend +2026-01-15 23:09:10,942 INFO MainThread:443684 [wandb_run.py:_console_start():2521] atexit reg +2026-01-15 23:09:10,942 INFO MainThread:443684 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2026-01-15 23:09:10,942 INFO MainThread:443684 [wandb_run.py:_redirect():2438] Wrapping output streams. +2026-01-15 23:09:10,943 INFO MainThread:443684 [wandb_run.py:_redirect():2461] Redirects installed. +2026-01-15 23:09:10,950 INFO MainThread:443684 [wandb_init.py:init():1081] run started, returning control to user process +2026-01-15 23:09:10,953 INFO MainThread:443684 [wandb_run.py:_config_callback():1396] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 8192, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 500000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/llm-align/liuchonghan/xiaomin/model/Meta-Llama-3-8B-Instruct', 'transformers_version': '4.44.0', 'model_type': 'bunny-llama', 'use_mm_proj': True, 'mm_projector_type': 'mlp2x_gelu', 'mm_hidden_size': 1280, 'image_aspect_ratio': 'square', 'tokenizer_padding_side': 'right', 'tokenizer_model_max_length': 2048, 'tune_mm_mlp_adapter': True, 'freeze_mm_mlp_adapter': False, 'mm_projector_lr': None, 'use_s2': False, 'output_dir': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0005, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain/runs/Jan15_23-06-00_h-liuchonghan-rler1225-a800-a100-2nodes-m-0', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100000, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': True, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/llm-align/liuchonghan/xiaomin/checkpoints-pretrain-densefusion/densefusion-llama3-8b-pretrain', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': './script/deepspeed/zero2.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'no', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False, 'cache_dir': None, 'mpt_attn_impl': 'triton', 'model_max_length': 2048, 'group_by_modality_length': False} +2026-01-15 23:09:10,972 INFO MainThread:443684 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8052289536 - > +2026-01-15 23:09:10,972 INFO MainThread:443684 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 8052289536 None +2026-01-16 08:17:31,777 INFO wandb-AsyncioManager-main:443684 [service_client.py:_forward_responses():80] Reached EOF. +2026-01-16 08:17:31,777 INFO wandb-AsyncioManager-main:443684 [mailbox.py:close():137] Closing mailbox, abandoning 2 handles. diff --git a/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/run-tzltcduc.wandb b/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/run-tzltcduc.wandb new file mode 100644 index 0000000000000000000000000000000000000000..2e8d1cb9a36771a00e3469b0f69b3670265e79d9 --- /dev/null +++ b/Unicorn_dense/wandb/run-20260115_230908-tzltcduc/run-tzltcduc.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:309d0ef38fa559a5d92b33732aeea019a20bfe95a8886a7c369704b974e5c204 +size 16994397