ccclemenfff commited on
Commit
cf932d8
·
1 Parent(s): 9e99e54

Add model code support

Browse files
Files changed (38) hide show
  1. .idea/.gitignore +8 -0
  2. .idea/embodied_explainer.iml +12 -0
  3. .idea/inspectionProfiles/profiles_settings.xml +6 -0
  4. .idea/modules.xml +8 -0
  5. .idea/vcs.xml +6 -0
  6. handler.py +52 -0
  7. inference.py +415 -0
  8. requirements.txt +13 -0
  9. robohusky/.DS_Store +0 -0
  10. robohusky/base_dataset.py +226 -0
  11. robohusky/base_dataset_uni.py +434 -0
  12. robohusky/compression.py +230 -0
  13. robohusky/configuration_husky.py +326 -0
  14. robohusky/constants.py +47 -0
  15. robohusky/conversation.py +511 -0
  16. robohusky/convert_fp16.py +27 -0
  17. robohusky/convert_husky_fp16.py +28 -0
  18. robohusky/convert_reward_fp16.py +27 -0
  19. robohusky/dist_utils.py +100 -0
  20. robohusky/llama2_flash_attn_monkey_patch.py +232 -0
  21. robohusky/model/__init__.py +70 -0
  22. robohusky/model/__pycache__/__init__.cpython-38.pyc +0 -0
  23. robohusky/model/__pycache__/configuration_husky.cpython-38.pyc +0 -0
  24. robohusky/model/__pycache__/modeling_husky_embody2.cpython-38.pyc +0 -0
  25. robohusky/model/compression.py +0 -0
  26. robohusky/model/configuration_husky.py +331 -0
  27. robohusky/model/configuration_husky_ori.py +327 -0
  28. robohusky/model/modeling_husky.py +1820 -0
  29. robohusky/model/modeling_husky_embody2.py +1962 -0
  30. robohusky/model/modeling_husky_embody2_ori.py +1821 -0
  31. robohusky/model/processing_husky.py +178 -0
  32. robohusky/train/.DS_Store +0 -0
  33. robohusky/train/llama_flash_attn_monkey_patch.py +232 -0
  34. robohusky/train/llama_rmsnorm_monkey_patch.py +15 -0
  35. robohusky/train/train.py +597 -0
  36. robohusky/train/train_uni.py +603 -0
  37. robohusky/utils.py +238 -0
  38. robohusky/video_transformers.py +406 -0
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # 默认忽略的文件
2
+ /shelf/
3
+ /workspace.xml
4
+ # 基于编辑器的 HTTP 客户端请求
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
.idea/embodied_explainer.iml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ <component name="PyDocumentationSettings">
9
+ <option name="format" value="GOOGLE" />
10
+ <option name="myDocStringFormat" value="Google" />
11
+ </component>
12
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/embodied_explainer.iml" filepath="$PROJECT_DIR$/.idea/embodied_explainer.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
handler.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, Any
3
+ from PIL import Image
4
+ from io import BytesIO
5
+
6
+ from inference import Chat # 直接import你放的inference.py里Chat类
7
+ from robohusky.conversation import get_conv_template
8
+
9
class EndpointHandler:
    """Inference-endpoint entry point wrapping the Husky ``Chat`` helper.

    One handler instance keeps a single ``Chat`` (model + tokenizer), the
    conversation template, and the most recent vision feature; every call
    starts a fresh conversation in ``preprocess``.
    """

    def __init__(self, path: str = "."):
        # BUG FIX: the original module referenced ``torch`` without importing it,
        # so ``torch.cuda.is_available()`` raised NameError on startup.  A
        # function-scope import keeps the fix local to this handler.
        import torch

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.chat = Chat(
            model_path=path,
            device=self.device,
            num_gpus=1,
            max_new_tokens=1024,
            load_8bit=False
        )
        # Cached embedding of the most recently uploaded image/video
        # (None when the request is text-only).
        self.vision_feature = None
        self.modal_type = "text"
        self.conv = get_conv_template("husky").copy()

    def preprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Reset the conversation and extract a vision feature if media is attached.

        ``inputs`` carries the user text under "inputs" and, optionally, raw
        bytes under "image" or "video".
        """
        query = inputs.get("inputs", "")
        self.conv = get_conv_template("husky").copy()
        self.vision_feature = None
        self.modal_type = "text"

        if "image" in inputs:
            image_bytes = inputs["image"]
            image = Image.open(BytesIO(image_bytes)).convert("RGB")
            # The embedding helper expects a file path, so round-trip via disk.
            image.save("temp.jpg")
            self.vision_feature = self.chat.get_image_embedding("temp.jpg")
            self.modal_type = "image"

        elif "video" in inputs:
            video_bytes = inputs["video"]
            with open("temp.mp4", "wb") as f:
                f.write(video_bytes)
            self.vision_feature = self.chat.get_video_embedding("temp.mp4")
            self.modal_type = "video"

        return {"query": query}

    def __call__(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """Handle one request: build the prompt, generate, record the reply."""
        processed = self.preprocess(inputs)
        query = processed["query"]

        conversations = self.chat.ask(text=query, conv=self.conv, modal_type=self.modal_type)
        outputs = self.chat.answer(conversations, self.vision_feature, modal_type=self.modal_type)
        # NOTE: strip keeps the stored history aligned with the training format.
        self.conv.messages[-1][1] = outputs.strip()
        return {"output": outputs.strip()}
inference.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ srun -p INTERN2 --job-name='husky_multi_test' --gres=gpu:1 --cpus-per-task=8 --quotatype="auto" python -u demo/inference_new.py
3
+ """
4
+
5
+ import abc
6
+ from typing import Optional
7
+
8
+ import os
9
+ import requests
10
+ from PIL import Image
11
+ from io import BytesIO
12
+
13
+ import torch
14
+ import torchvision.transforms as T
15
+ from peft import PeftModel
16
+ from torchvision.transforms.functional import InterpolationMode
17
+
18
+ from transformers import (
19
+ LlamaTokenizer,
20
+ GenerationConfig,
21
+ StoppingCriteria,
22
+ StoppingCriteriaList,
23
+ )
24
+
25
+ from robohusky.model.modeling_husky_embody2 import HuskyForConditionalGeneration
26
+
27
+ from robohusky.conversation import (
28
+ conv_templates,
29
+ get_conv_template,
30
+ )
31
+
32
+ from robohusky.video_transformers import (
33
+ GroupNormalize,
34
+ GroupScale,
35
+ GroupCenterCrop,
36
+ Stack,
37
+ ToTorchFormatTensor,
38
+ get_index,
39
+ )
40
+
41
+ from robohusky.compression import compress_module
42
+ from decord import VideoReader, cpu
43
+
44
+ # import deepspeed
45
+
46
+ IGNORE_INDEX = -100
47
+ DEFAULT_UNK_TOKEN = "<unk>"
48
+ DEFAULT_IMG_START_TOKEN = "<img>"
49
+ DEFAULT_IMG_END_TOKEN = "</img>"
50
+
51
+ DEFAULT_VIDEO_START_TOKEN = "<vid>"
52
+ DEFAULT_VIDEO_END_TOKEN = "</vid>"
53
+
54
def get_gpu_memory(max_gpus=None):
    """Return the currently free VRAM, in GiB, of each visible CUDA device.

    Args:
        max_gpus: optional cap on how many devices to inspect; ``None``
            inspects every device PyTorch can see.

    Returns:
        A list of floats (total minus allocated memory, GiB) per GPU.
    """
    if max_gpus is None:
        gpu_count = torch.cuda.device_count()
    else:
        gpu_count = min(max_gpus, torch.cuda.device_count())

    free_per_gpu = []
    for idx in range(gpu_count):
        # Query each device in its own context so memory_allocated() reports
        # the right device.
        with torch.cuda.device(idx):
            props = torch.cuda.get_device_properties(torch.cuda.current_device())
            total_gib = props.total_memory / (1024 ** 3)
            used_gib = torch.cuda.memory_allocated() / (1024 ** 3)
            free_per_gpu.append(total_gib - used_gib)
    return free_per_gpu
71
+
72
def load_model(
    model_path, device, num_gpus, max_gpu_memory=None, load_8bit=False, lora_weights=None
):
    """Load the Husky model and Llama tokenizer for inference.

    Args:
        model_path: checkpoint directory for both model and tokenizer.
        device: "cpu" or "cuda" (anything else raises ValueError).
        num_gpus: int, or "auto" to let accelerate shard the model.
        max_gpu_memory: optional per-GPU memory cap for multi-GPU loads.
        load_8bit: apply post-hoc weight compression via compress_module.
        lora_weights: optional PEFT/LoRA adapter path wrapped around the
            language model.

    Returns:
        (model, tokenizer) with the model in eval mode.
    """
    if device == "cpu":
        kwargs = {}
    elif device == "cuda":
        kwargs = {"torch_dtype": torch.float16}
        if num_gpus == "auto":
            kwargs["device_map"] = "auto"
        else:
            num_gpus = int(num_gpus)
            if num_gpus != 1:
                kwargs["device_map"] = "auto"
                if max_gpu_memory is None:
                    kwargs[
                        "device_map"
                    ] = "sequential"  # This is important for not the same VRAM sizes
                    # Budget 85% of the currently free memory on each GPU.
                    available_gpu_memory = get_gpu_memory(num_gpus)
                    kwargs["max_memory"] = {
                        i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
                        for i in range(num_gpus)
                    }
                else:
                    kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
    else:
        raise ValueError(f"Invalid device: {device}")

    tokenizer = LlamaTokenizer.from_pretrained(
        model_path, use_fast=False)

    if lora_weights is None:
        model = HuskyForConditionalGeneration.from_pretrained(
            model_path, low_cpu_mem_usage=True, **kwargs
        )
    else:
        # With LoRA adapters, always let accelerate place the weights.
        kwargs["device_map"] = "auto"
        model = HuskyForConditionalGeneration.from_pretrained(
            model_path, low_cpu_mem_usage=True, **kwargs
        )
        model.language_model = PeftModel.from_pretrained(
            model.language_model,
            lora_weights,
            **kwargs
        )

    if load_8bit:
        compress_module(model, device)

    # Single-GPU (or Apple MPS) loads are moved explicitly; sharded loads are
    # already placed by the device map.
    if (device == "cuda" and num_gpus == 1) or device == "mps":
        model.to(device)

    model = model.eval()
    return model, tokenizer
125
+
126
def load_image(image_file, input_size=224):
    """Load an image from a local path or http(s) URL and preprocess it.

    Applies the standard resize / center-crop / ImageNet-normalize pipeline
    and returns a CHW float tensor suitable for the vision encoder.
    """
    if image_file.startswith(("http", "https")):
        response = requests.get(image_file)
        pil_image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        pil_image = Image.open(image_file).convert('RGB')

    # Resize so that the subsequent center crop keeps the usual 224/256 ratio.
    resize_target = int(input_size / (224 / 256))
    pipeline = T.Compose([
        T.Lambda(lambda img: img if img.mode == 'RGB' else img.convert('RGB')),
        T.Resize(resize_target, interpolation=InterpolationMode.BICUBIC),
        T.CenterCrop(input_size),
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ])
    return pipeline(pil_image)
144
+
145
def load_video(video_path, num_segments=8):
    """Sample ``num_segments`` evenly spaced frames from a video and preprocess them.

    Frames are decoded with decord, scaled/cropped to 224, and normalized with
    CLIP statistics.  ``Stack`` concatenates the frames, so the result is a
    single tensor with frames merged along the channel axis — downstream,
    Chat.get_video_embedding reshapes it as (T*C, H, W) -> (C, T, H, W).
    """
    vr = VideoReader(video_path, ctx=cpu(0))
    num_frames = len(vr)
    frame_indices = get_index(num_frames, num_segments)

    # transform (CLIP normalization statistics)
    crop_size = 224
    scale_size = 224
    input_mean = [0.48145466, 0.4578275, 0.40821073]
    input_std = [0.26862954, 0.26130258, 0.27577711]

    transform = T.Compose([
        GroupScale(int(scale_size), interpolation=InterpolationMode.BICUBIC),
        GroupCenterCrop(crop_size),
        Stack(),
        ToTorchFormatTensor(),
        GroupNormalize(input_mean, input_std)
    ])

    images_group = list()
    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].asnumpy())
        images_group.append(img)
    video = transform(images_group)
    return video
170
+
171
class StoppingCriteriaSub(StoppingCriteria):
    """Stop generation once any configured stop-token sequence ends the output.

    ``stops`` is a list of 1-D LongTensors of token ids (one per stop word).
    """

    def __init__(self, stops, encounters=1):
        super().__init__()
        # NOTE: ``encounters`` is accepted for API compatibility but unused —
        # generation halts on the first match.
        self.stops = stops

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        for stop in self.stops:
            # Skip stop sequences longer than what has been generated so far;
            # comparing mismatched lengths would raise a shape error.
            if len(stop) > input_ids.shape[1]:
                continue
            # BUG FIX: stop ids are typically built on CPU while generation
            # runs on GPU; align devices before comparing.
            stop = stop.to(input_ids.device)
            if torch.all(stop == input_ids[0][-len(stop):]).item():
                return True

        return False
183
+
184
@torch.inference_mode()
def generate_stream(
    model, tokenizer, image_processor, params, device
):
    """Run one (non-streaming, despite the name) generation from a params dict.

    Args:
        model: HuskyForConditionalGeneration (multimodal) model.
        tokenizer: matching Llama tokenizer.
        image_processor: unused here; kept for call-site compatibility.
        params: dict with "prompt" (required) and optional "images"/"videos"
            (single file path each), "temperature", "max_new_tokens".
        device: device the pixel tensors are moved to.

    Returns:
        List of decoded output strings (batch of one).
    """
    prompt = params["prompt"]
    images = params.get("images", None)
    videos = params.get("videos", None)
    temperature = float(params.get("temperature", 0.7))
    max_new_tokens = int(params.get("max_new_tokens", 1024))

    # NOTE(review): computed but unused in this function.
    num_queries = model.config.num_query_tokens

    # Stop as soon as the model starts a new dialogue turn.
    stop_words = ["Human: ", "Assistant: ", "###", "\n\n"]
    stop_words_ids = [tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
    stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])

    generation_config = GenerationConfig(
        bos_token_id=1,
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        stopping_criteria=stopping_criteria
    )

    pixel_values = None
    if images is not None:
        pixel_values = load_image(images).to(device)  # only support one image
        # Replace the placeholder with the start/end marker pair the model expects.
        image_query = DEFAULT_IMG_START_TOKEN + DEFAULT_IMG_END_TOKEN
        prompt = prompt.replace("<image>", image_query)

    elif videos is not None:
        pixel_values = load_video(videos).to(device)
        video_query = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_END_TOKEN
        prompt = prompt.replace("<video>", video_query)

    model_inputs = tokenizer([prompt], return_tensors="pt")
    model_inputs.pop("token_type_ids", None)

    if pixel_values is not None:
        model_inputs["pixel_values"] = pixel_values

        generation_output = model.generate(
            **model_inputs,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True
        )
    else:
        # Text-only prompt: bypass the vision stack and use the LM directly.
        generation_output = model.language_model.generate(
            **model_inputs,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True
        )

    preds = generation_output.sequences
    outputs = tokenizer.batch_decode(preds, skip_special_tokens=True)
    return outputs
242
+
243
class Chat:
    """Stateful multimodal chat wrapper around HuskyForConditionalGeneration.

    Holds the model/tokenizer pair, a conversation template, and a fixed
    GenerationConfig.  ``ask`` builds the prompt for the next turn and
    ``answer`` runs generation; vision inputs are pre-encoded once via
    ``get_image_embedding`` / ``get_video_embedding`` and reused across turns.
    """

    def __init__(
        self,
        model_path,
        device,
        num_gpus=1,
        load_8bit=False,
        temperature=0.7,
        max_new_tokens=512,
        lora_path=None,
    ):
        model, tokenizer = load_model(
            model_path, device, num_gpus, load_8bit=load_8bit, lora_weights=lora_path
        )

        self.model = model
        # self.model.language_model = deepspeed.init_inference(
        #     self.model.language_model, mp_size=1, dtype=torch.float16, checkpoint=None, replace_with_kernel_inject=True)
        self.tokenizer = tokenizer
        # NOTE(review): computed but unused; kept to mirror generate_stream.
        num_queries = model.config.num_query_tokens

        self.device = device
        self.dtype = model.dtype

        # Stop generation when the model starts a new dialogue turn.
        stop_words = ["Human: ", "Assistant: ", "###", "\n\n"]
        stop_words_ids = [tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
        stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])

        self.conv = get_conv_template("husky")

        # Placeholder marker pairs that show the model where vision features go.
        self.image_query = DEFAULT_IMG_START_TOKEN + DEFAULT_IMG_END_TOKEN
        self.video_query = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_END_TOKEN

        self.generation_config = GenerationConfig(
            bos_token_id=1,
            do_sample=True,
            top_k=20,
            top_p=0.9,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            stopping_criteria=stopping_criteria
        )

    def ask(self, text, conv, modal_type="image"):
        """Append the user turn and an empty assistant slot; return the prompt.

        The media placeholder is only prefixed on the *first* turn of an
        image/video conversation; follow-up turns are plain text.
        """
        assert modal_type in ["text", "image", "video"]
        conversations = []

        if len(conv.messages) > 0 or modal_type == "text":
            conv.append_message(conv.roles[0], text)
        elif modal_type == "image":
            conv.append_message(conv.roles[0], self.image_query + "\n" + text)
        else:
            conv.append_message(conv.roles[0], self.video_query + "\n" + text)

        conv.append_message(conv.roles[1], None)
        conversations.append(conv.get_prompt())
        return conversations

    @torch.no_grad()
    def get_image_embedding(self, image_file):
        """Encode one image file into language-model input features."""
        pixel_values = load_image(image_file)
        pixel_values = pixel_values.unsqueeze(0).to(self.device, dtype=self.dtype)
        language_model_inputs = self.model.extract_feature(pixel_values)
        return language_model_inputs

    @torch.no_grad()
    def get_video_embedding(self, video_file):
        """Encode one video file into language-model input features."""
        pixel_values = load_video(video_file)
        # load_video stacks frames along the channel axis: (T*C, H, W).
        TC, H, W = pixel_values.shape
        pixel_values = pixel_values.reshape(TC // 3, 3, H, W).transpose(0, 1)  # [C, T, H, W]
        pixel_values = pixel_values.unsqueeze(0).to(self.device, dtype=self.dtype)
        assert len(pixel_values.shape) == 5
        language_model_inputs = self.model.extract_feature(pixel_values)
        return language_model_inputs

    @torch.no_grad()
    def answer(self, conversations, language_model_inputs, modal_type="image"):
        """Generate the assistant reply for the prompt built by ``ask``.

        Text-only chats run the bare language model and strip the echoed
        prompt from the decoded output; image/video chats run the full
        multimodal model with the precomputed vision features.
        """
        model_inputs = self.tokenizer(
            conversations,
            return_tensors="pt",
        )
        model_inputs.pop("token_type_ids", None)

        input_ids = model_inputs["input_ids"].to(self.device)
        attention_mask = model_inputs["attention_mask"].to(self.device)

        if modal_type == "text":
            generation_output = self.model.language_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                generation_config=self.generation_config,
                return_dict_in_generate=True,
                output_scores=True
            )
        else:
            pixel_values = model_inputs.pop("pixel_values", None)
            if pixel_values is not None:
                pixel_values = pixel_values.to(self.device)

            generation_output = self.model.generate(
                pixel_values=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask,
                language_model_inputs=language_model_inputs,
                generation_config=self.generation_config,
                return_dict_in_generate=True,
                output_scores=True
            )

        preds = generation_output.sequences
        outputs = self.tokenizer.batch_decode(preds, skip_special_tokens=True)[0]

        if modal_type == "text":
            # Drop the echoed prompt; each "</s>" in the prompt string decodes
            # away, shifting the character offset by 3 apiece.
            skip_echo_len = len(conversations[0]) - conversations[0].count("</s>") * 3
            outputs = outputs[skip_echo_len:].strip()

        return outputs
360
+
361
if __name__ == '__main__':
    # Interactive console demo: typing a media file path switches modality,
    # "clear" (or an empty line) resets the conversation, "stop" exits.
    # model_path = "/mnt/petrelfs/zhangqinglong/Documents/Husky/work_dirs/husky_v3/EmbodiedGPT/pretrain_0727"
    model_path = "/mnt/petrelfs/share_data/gvembodied/workdirs/align_new_myyf"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    chat = Chat(model_path, device=device, num_gpus=1, max_new_tokens=1024, load_8bit=False)

    vision_feature = None
    image_state = False
    video_state = False

    while True:
        query = input("\n")
        # A path with an image extension loads the image and resets history.
        if query.lower().endswith(('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')):
            if os.path.exists(query):
                print("received.")
                vision_feature = chat.get_image_embedding(query)
                chat.conv = get_conv_template("husky").copy()
                image_state = True
            continue
        # Likewise for video extensions.
        if query.lower().endswith(('.mp4', '.mkv', '.avi', '.wmv', '.iso', ".webm")):
            if os.path.exists(query):
                print("received.")
                vision_feature = chat.get_video_embedding(query)
                chat.conv = get_conv_template("husky").copy()
                video_state = True
            continue

        if query == "stop":
            break
        if query == "clear" or query == "" or query == "\n":
            # Reset conversation and modality state, then reprint the banner.
            chat.conv = get_conv_template("husky").copy()
            image_state = False
            video_state = False
            os.system("clear")
            print("欢迎使用 husky-13b-zh 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
            continue

        if image_state:
            modal_type = "image"
        elif video_state:
            modal_type = "video"
        else:
            modal_type = "text"

        # image_test = "assets/husky.jpg"
        # image_test = "assets/yoga.mp4"
        # video_test = "assets/pretty_girl.mp4"
        # video_test = "assets/stock-footage-billiards-concentrated-young-woman-playing-in-club.webm"
        # video_test = "assets/stock-footage-kherson-ukraine-may-open-free-rock-music-festival-crowd-partying-at-a-rock-concert.webm"
        conversations = chat.ask(text=query, conv=chat.conv, modal_type=modal_type)
        outputs = chat.answer(conversations, vision_feature, modal_type=modal_type)
        # NOTE: strip is important to align with the training data.
        chat.conv.messages[-1][1] = outputs.strip()

        print(f"Husky: \n{outputs}")
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.0.1
2
+ torchvision==0.15.2
3
+ torchaudio==2.0.2
4
+ transformers==4.34.1
5
+ decord
6
+ peft
7
+ huggingface_hub
8
+ Pillow
9
+ einops
10
+ scipy
11
+ numpy
12
+ tqdm
13
+ flash-attn
robohusky/.DS_Store ADDED
Binary file (6.15 kB). View file
 
robohusky/base_dataset.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+
4
+ from typing import Dict, Optional, Sequence
5
+ from PIL import PngImagePlugin, Image, ImageFile
6
+
7
+ import torch
8
+ from torch.utils.data import Dataset
9
+ import torchvision.transforms as T
10
+ from torchvision.transforms.functional import InterpolationMode
11
+
12
+ from robohusky.train.tcsloader import TCSLoader
13
+ from robohusky.conversation import get_conv_template
14
+
15
+ IGNORE_INDEX = -100
16
+
17
+ Image.MAX_IMAGE_PIXELS = None
18
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
19
+ MaximumDecompressedSize = 1024
20
+ MegaByte = 2 ** 20
21
+ PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte
22
+
23
+ DEFAULT_IMG_START_TOKEN = "<img>"
24
+ DEFAULT_IMG_END_TOKEN = "</img>"
25
+
26
+ DEFAULT_VIDEO_START_TOKEN = "<vid>"
27
+ DEFAULT_VIDEO_END_TOKEN = "</vid>"
28
+
29
def is_image(image_file):
    """Return True if the path carries a known still-image extension (case-insensitive)."""
    image_suffixes = ('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')
    return image_file.lower().endswith(image_suffixes)
34
+
35
def is_video(image_file):
    """Return True if the path carries a known video extension (case-insensitive)."""
    video_suffixes = ('.mp4', '.mkv', '.avi', '.wmv', '.iso', ".webm")
    return image_file.lower().endswith(video_suffixes)
40
+
41
def build_transform(input_size):
    """Build the standard image pipeline: RGB convert, square resize,
    tensor conversion, ImageNet normalization."""
    return T.Compose([
        T.Lambda(lambda img: img if img.mode == 'RGB' else img.convert('RGB')),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ])
49
+
50
def format_inputs(sources):
    """Render raw conversation records into husky-template prompt strings.

    Args:
        sources: list of conversations; each is a list of
            {"from": "human"|"gpt", "value": str} turns.

    Returns:
        (conversations, conv): the rendered prompt strings and the (last-used)
        conversation template, whose separators the caller needs for masking.
    """
    # Apply prompt templates
    conv = get_conv_template("husky").copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
    conversations = []

    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], f"{i}"
            # vision is only supported for the human input
            if role == conv.roles[0]:
                value = sentence["value"]
                if "<image>" in value:
                    # Normalize trailing placeholders to the front, then swap
                    # <image> for the <img></img> marker pair.
                    if value.endswith("\n<image>"):
                        value = "<image>\n" + value.replace("\n<image>", "")
                    image_query = DEFAULT_IMG_START_TOKEN + DEFAULT_IMG_END_TOKEN
                    sentence["value"] = value.replace("<image>", image_query)

                elif "<video>" in value:
                    if value.endswith("\n<video>"):
                        value = "<video>\n" + value.replace("\n<video>", "")
                    video_query = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_END_TOKEN
                    sentence["value"] = value.replace("<video>", video_query)

            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    return conversations, conv
84
+
85
def process_func(examples, tokenizer, max_seq_length):
    """Tokenize formatted conversations and build training labels.

    Pads/truncates every conversation to ``max_seq_length`` and masks all
    positions except the assistant responses with IGNORE_INDEX, locating
    instruction spans via the template's turn separators.
    """
    conversations, conv = format_inputs(examples['conversations'])
    model_inputs = tokenizer(
        conversations,
        max_length=max_seq_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    model_inputs.pop("token_type_ids", None)
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    targets = model_inputs["input_ids"].clone()

    # Mask targets
    sep = conv.sep + conv.roles[1] + ": "
    for conversation, target in zip(conversations, targets):
        # Count of real (non-pad) tokens, for the consistency check below.
        total_len = int(target.ne(tokenizer.pad_token_id).sum())

        turns = conversation.split(conv.sep2)
        cur_len = 1  # skip the BOS token
        target[:cur_len] = IGNORE_INDEX
        for i, turn in enumerate(turns):
            if turn == "":
                break
            turn_len = len(tokenizer(turn).input_ids)

            parts = turn.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep

            # "-2" is hardcoded for the Llama tokenizer to make the offset correct.
            instruction_len = len(tokenizer(parts[0]).input_ids) - 2

            if i != 0 and not tokenizer.legacy:
                # The legacy and non-legacy modes handle special tokens differently
                instruction_len -= 1

            # Ignore the user instructions
            target[cur_len: cur_len + instruction_len] = IGNORE_INDEX
            cur_len += turn_len

            if i != 0 and not tokenizer.legacy:
                # The legacy and non-legacy modes handle special tokens differently
                cur_len -= 1

        target[cur_len:] = IGNORE_INDEX

        if cur_len < tokenizer.model_max_length:
            if cur_len != total_len:
                # Tokenization drifted from the expected turn lengths; mask the
                # whole sample rather than train on a misaligned label set.
                target[:] = IGNORE_INDEX

    model_inputs["labels"] = targets
    return model_inputs
141
+
142
class BaseDataset(Dataset):
    """Supervised dataset that reads images from the local filesystem.

    Processed samples are memoised in ``cached_data_dict``; samples whose
    image file is missing are skipped by falling through to the next index.
    """

    def __init__(self, dataset, processor, image_path="", input_size=224):
        super(BaseDataset, self).__init__()
        self.dataset = dataset
        # Optional root directory prepended to each sample's image path.
        self.image_path = image_path

        self.transform = build_transform(input_size)
        self.husky_processor = processor

        # index -> processed sample.  NOTE(review): unbounded; grows to the
        # full dataset size over one epoch.
        self.cached_data_dict = {}

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        if i in self.cached_data_dict:
            return self.cached_data_dict[i]

        data = self.dataset[i]
        image_file = data.pop("image", None)

        if self.image_path != "":
            image_file = os.path.join(self.image_path, image_file)
            if not os.path.exists(image_file):
                # Missing file: serve the next sample instead (wraps around).
                return self.__getitem__((i + 1) % len(self.dataset))
            image = Image.open(image_file)
        else:
            image = Image.open(image_file)

        # The processor expects batched columns, so wrap then unwrap each field.
        for k, v in data.items():
            data[k] = [v]
        ret = self.husky_processor(data)
        for k, v in ret.items():
            ret[k] = v[0]

        pixel_values = self.transform(image)
        ret["pixel_values"] = pixel_values

        self.cached_data_dict[i] = ret
        return ret
182
+
183
class CephDataset(Dataset):
    """Supervised dataset that reads images from Ceph/petrel storage via TCSLoader.

    Unreadable images are appended to ``error.txt`` and replaced by a
    randomly chosen sample.
    """

    def __init__(self, dataset, processor, input_size=224):
        super(CephDataset, self).__init__()
        self.dataset = dataset

        self.transform = build_transform(input_size)
        self.husky_processor = processor

        conf_path = "./petrelf.conf"
        self.conf_path = os.path.abspath(conf_path)

        self.initialized = False
        self._init_memcached()

    def _init_memcached(self):
        # Create the petrel client exactly once per dataset instance.
        if not self.initialized:
            assert self.conf_path is not None
            self.mt_loader = TCSLoader(self.conf_path)
            self.initialized = True

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        data = self.dataset[i]
        image_file = data.pop("image", None)

        try:
            image = self.mt_loader(image_file).convert('RGB')
        except (AttributeError, OSError):
            # Log the bad path and retry with a random replacement sample.
            with open("error.txt", 'a') as f:
                f.write(image_file + '\n')
            # randint is inclusive of both ends; the modulo below keeps the
            # replacement index in range.
            i = random.randint(0, len(self.dataset))
            return self.__getitem__(i % len(self.dataset))

        # The processor expects batched columns, so wrap then unwrap each field.
        for k, v in data.items():
            data[k] = [v]

        ret = self.husky_processor(data)
        for k, v in ret.items():
            ret[k] = v[0]
        pixel_values = self.transform(image)
        ret["pixel_values"] = pixel_values
        return ret
robohusky/base_dataset_uni.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+
4
+ from typing import Dict, Optional, Sequence, Iterator, List, Iterable, Union
5
+ from PIL import PngImagePlugin, Image, ImageFile, ImageOps
6
+
7
+ import numpy as np
8
+
9
+ import torch
10
+ from torch.utils.data import (
11
+ Dataset,
12
+ ConcatDataset,
13
+ Sampler,
14
+ WeightedRandomSampler
15
+ )
16
+ import torchvision.transforms as T
17
+ from torchvision.transforms.functional import InterpolationMode
18
+
19
+ from robohusky.train.tcsloader import TCSLoader
20
+
21
+ from decord import VideoReader, cpu
22
+ from robohusky.video_transformers import (
23
+ GroupNormalize,
24
+ GroupScale,
25
+ GroupCenterCrop,
26
+ Stack,
27
+ ToTorchFormatTensor,
28
+ get_index,
29
+ )
30
+
31
+ from robohusky.conversation import get_conv_template
32
+
33
+ IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406]
34
+ IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225]
35
+ IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5]
36
+ IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5]
37
+ OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
38
+ OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
39
+
40
+ IGNORE_INDEX = -100
41
+
42
+ Image.MAX_IMAGE_PIXELS = None
43
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
44
+ MaximumDecompressedSize = 1024
45
+ MegaByte = 2 ** 20
46
+ PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte
47
+
48
+ DEFAULT_IMG_START_TOKEN = "<img>"
49
+ DEFAULT_IMG_END_TOKEN = "</img>"
50
+
51
+ DEFAULT_VIDEO_START_TOKEN = "<vid>"
52
+ DEFAULT_VIDEO_END_TOKEN = "</vid>"
53
+
54
+ DEFAULT_EMBED_TOKEN = "<quad>"
55
+
56
+ conf_path = "/your path to/petrelf.conf"
57
+
58
def is_image(image_file):
    """Return True if the path carries a known still-image extension (case-insensitive)."""
    image_suffixes = ('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')
    return image_file.lower().endswith(image_suffixes)
63
+
64
def is_video(image_file):
    """Return True if the path carries a known video extension (case-insensitive)."""
    video_suffixes = ('.mp4', '.mkv', '.avi', '.wmv', '.iso', ".webm")
    return image_file.lower().endswith(video_suffixes)
69
+
70
def is_numpy(image_file):
    """Return True if the path has a NumPy ``.npy`` extension.

    Now matched case-insensitively for consistency with the sibling
    ``is_image`` / ``is_video`` helpers, which also lowercase before
    checking suffixes (backward-compatible: all previously-True inputs
    remain True).
    """
    return image_file.lower().endswith(".npy")
75
+
76
def get_media_type(image_file):
    """Classify a file path as "image", "video", or "numpy"; fall back to "text"."""
    checks = (
        (is_image, "image"),
        (is_video, "video"),
        (is_numpy, "numpy"),
    )
    for predicate, label in checks:
        if predicate(image_file):
            return label
    return "text"
85
+
86
def build_transform(input_size, norm_type="openai", media_type="image"):
    """Build the preprocessing pipeline for one media type.

    norm_type selects the normalization statistics ("openai" = CLIP,
    anything else = ImageNet defaults).  media_type "image" uses the
    per-image pipeline, "video" the grouped-frame pipeline, and any other
    value yields None (no transform).
    """
    if norm_type == "openai":
        mean, std = OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
    else:
        # "imagenet" and any unknown value both use ImageNet statistics.
        mean, std = IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

    if media_type == "image":
        return T.Compose([
            T.Lambda(lambda img: img if img.mode == 'RGB' else img.convert('RGB')),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=mean, std=std),
        ])
    if media_type == "video":
        return T.Compose([
            GroupScale(int(input_size), interpolation=InterpolationMode.BICUBIC),
            GroupCenterCrop(input_size),
            Stack(),
            ToTorchFormatTensor(),
            GroupNormalize(mean=mean, std=std),
        ])
    return None
115
+
116
def check_format(data):
    """Validate one conversation record.

    A valid record carries 'id', 'image' and an even-length
    'conversations' list that alternates human/gpt turns, whose first
    turn holds an "<image>" placeholder (leading or trailing) and whose
    every turn has a non-empty value.  Any violation is reported on
    stdout and makes the function return False.
    """
    has_required_fields = (
        'id' in data
        and 'image' in data
        and 'conversations' in data
        and len(data['conversations']) % 2 == 0
    )
    if not has_required_fields:
        print(f"Lake field: {data}")
        return False

    for idx, message in enumerate(data['conversations']):
        value = message['value']
        if idx == 0 and not (value.startswith("<image>\n") or value.endswith("\n<image>")):
            print(f"No <image>: {data}")
            return False
        # Even positions must come from the human, odd ones from the model.
        expected_role = 'human' if idx % 2 == 0 else 'gpt'
        if message['from'] != expected_role:
            if expected_role == 'human':
                print(f"Not from human: {data}")
            else:
                print(f"Not from gpt: {data}")
            return False
        if value is None or len(value) == 0:
            print(f"No Message: {data}")
            return False
    return True
137
+
138
def format_inputs(sources, conv_tempt="husky", num_query_tokens=256):
    """Render raw conversation lists into prompt strings.

    Each element of ``sources`` is a list of ``{"from": "human"|"gpt",
    "value": str}`` turns.  "<image>"/"<video>" placeholders in human
    turns are normalised to the front of the turn and expanded into
    ``<img><quad>*N</img>`` / ``<vid><quad>*N</vid>`` spans
    (N = ``num_query_tokens``), then all turns are laid out with the
    conversation template named by ``conv_tempt``.

    Returns ``(conversations, conv)``: the rendered prompt strings and
    the template object whose separators the caller uses for masking.

    NOTE: ``sentence["value"]`` is rewritten in place, so ``sources`` is
    mutated by this call.
    """
    # Apply prompt templates
    conv = get_conv_template(conv_tempt).copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
    conversations = []

    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            # After the optional skip above, turns must strictly alternate.
            assert role == conv.roles[j % 2], f"{i}"
            # vision is only supported for the human input
            if role == conv.roles[0]:
                value = sentence["value"]
                if "<image>" in value:
                    # Normalise a trailing "<image>" to a leading one.
                    if value.endswith("\n<image>"):
                        value = "<image>\n" + value.replace("\n<image>", "")

                    image_query = DEFAULT_IMG_START_TOKEN + num_query_tokens * DEFAULT_EMBED_TOKEN + DEFAULT_IMG_END_TOKEN
                    sentence["value"] = value.replace("<image>", image_query)

                elif "<video>" in value:
                    # Normalise a trailing "<video>" to a leading one.
                    if value.endswith("\n<video>"):
                        value = "<video>\n" + value.replace("\n<video>", "")

                    video_query = DEFAULT_VIDEO_START_TOKEN + num_query_tokens * DEFAULT_EMBED_TOKEN + DEFAULT_VIDEO_END_TOKEN
                    sentence["value"] = value.replace("<video>", video_query)

            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    return conversations, conv
174
+
175
def process_func(examples, tokenizer, max_seq_length=-1, conv_tempt="husky", num_query_tokens=256):
    """Tokenize rendered conversations and build loss-masked labels.

    Prompts produced by ``format_inputs`` are tokenized — truncated to
    ``tokenizer.model_max_length`` when ``max_seq_length`` < 0, otherwise
    padded/truncated to ``max_seq_length`` — and ``labels`` are derived
    from ``input_ids`` with everything except the assistant replies set
    to IGNORE_INDEX, using the separators of the ``conv`` template.

    NOTE(review): the masking assumes a two-separator template
    (role prompt joined with ``conv.sep``, turns delimited by
    ``conv.sep2``) and Llama-style tokenization — the hard-coded "-2"
    offset and the ``tokenizer.legacy`` adjustments below. Confirm
    before reusing with another tokenizer family.
    """
    conversations, conv = format_inputs(examples['conversations'], conv_tempt, num_query_tokens)
    if max_seq_length < 0:
        model_inputs = tokenizer(
            conversations,
            return_tensors="pt",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
    else:
        model_inputs = tokenizer(
            conversations,
            max_length=max_seq_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    # Some tokenizers emit token_type_ids; the model does not consume them.
    model_inputs.pop("token_type_ids", None)
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    targets = model_inputs["input_ids"].clone()

    # Mask targets
    sep = conv.sep + conv.roles[1] + ": "
    for conversation, target in zip(conversations, targets):
        # Number of real (non-pad) tokens, used for the sanity check below.
        total_len = int(target.ne(tokenizer.pad_token_id).sum())

        turns = conversation.split(conv.sep2)
        cur_len = 1  # skip the BOS token
        target[:cur_len] = IGNORE_INDEX
        for i, turn in enumerate(turns):
            if turn == "":
                break
            turn_len = len(tokenizer(turn).input_ids)

            parts = turn.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep

            # "-2" is hardcoded for the Llama tokenizer to make the offset correct.
            instruction_len = len(tokenizer(parts[0]).input_ids) - 2

            if i != 0 and not tokenizer.legacy:
                # The legacy and non-legacy modes handle special tokens differently
                instruction_len -= 1

            # Ignore the user instructions
            target[cur_len: cur_len + instruction_len] = IGNORE_INDEX
            cur_len += turn_len

            if i != 0 and not tokenizer.legacy:
                # The legacy and non-legacy modes handle special tokens differently
                cur_len -= 1

        # Mask everything past the last processed turn (padding / truncation tail).
        target[cur_len:] = IGNORE_INDEX

        if cur_len < tokenizer.model_max_length:
            # Offset bookkeeping went wrong for this sample: drop it from the loss
            # entirely rather than train on misaligned labels.
            if cur_len != total_len:
                target[:] = IGNORE_INDEX

    model_inputs["labels"] = targets
    return model_inputs
239
+
240
class BaseDataset(Dataset):
    """Map-style dataset yielding processed text (and optional visual) samples.

    Visual payloads are loaded from local disk or from Ceph ("s3://" paths,
    via ``TCSLoader``), preprocessed with the transform built by
    ``build_transform``, and merged with the tokenized text produced by
    ``processor``.  Unreadable or missing files are logged to ``error.txt``
    and replaced by a random substitute sample.

    NOTE(review): every returned item is memoised in ``cached_data_dict``,
    which grows without bound over an epoch — confirm the dataset fits in
    memory before enabling this for large corpora.
    """

    def __init__(
        self,
        dataset,
        processor,
        image_path="",
        input_size=224,
        num_segments=8,
        norm_type="openai",
        media_type="image"
    ):
        super(BaseDataset, self).__init__()
        self.dataset = dataset          # sequence of records with "image"/"video" + text fields
        self.image_path = image_path    # optional root prepended to relative media paths
        self.input_size = input_size    # spatial size fed to the vision transform
        self.num_segments = num_segments  # frames sampled per video

        self.media_type = media_type
        self.transform = build_transform(input_size, norm_type, media_type)
        self.husky_processor = processor  # tokenizes/packs the text fields
        self.tcs_loader = TCSLoader(os.path.abspath(conf_path), media_type=media_type)

        # index -> processed sample; unbounded memoisation (see class note).
        self.cached_data_dict = {}

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        if i in self.cached_data_dict:
            return self.cached_data_dict[i]

        data = self.dataset[i]
        image_file = data["image"] if "image" in data else data["video"]

        if self.media_type == "llm" or image_file == "":
            # Text-only sample: no visual payload is attached.
            # Pseudo pixel_values
            # pixel_values = torch.zeros(size=(3, self.input_size, self.input_size))
            pixel_values = None
        else:
            if self.image_path != "":
                image_file = os.path.join(self.image_path, image_file)
            if "s3://" not in image_file and not os.path.exists(image_file):
                # Missing local file: fall back to a random substitute sample.
                # (randint's upper bound is inclusive; the modulo below keeps it in range.)
                i = random.randint(0, len(self.dataset))
                return self.__getitem__(i % len(self.dataset))

            try:
                if self.media_type == "image":
                    # load from ceph
                    if "s3://" in image_file:
                        image = self.tcs_loader(image_file)
                    else:
                        image = Image.open(image_file).convert('RGB')

                    # process image with extreme aspect ratios
                    # NOTE(review): PIL's Image.size is (width, height), so these two
                    # locals are name-swapped; the padding branches are consistent with
                    # the swap (wide images get top/bottom padding, tall ones get
                    # left/right), so behaviour is correct — only the names mislead.
                    height, width = image.size
                    if height / width >= 1.8:
                        delta = height - width
                        padding = (0, delta // 2, 0, delta - delta // 2)
                        image = ImageOps.expand(image, padding)
                    elif height / width <= 0.56:
                        delta = width - height
                        padding = (delta // 2, 0, delta - delta // 2, 0)
                        image = ImageOps.expand(image, padding)
                    pixel_values = self.transform(image)
                elif self.media_type == "video":
                    if "s3://" in image_file:
                        vr = self.tcs_loader(image_file)
                    else:
                        vr = VideoReader(image_file, ctx=cpu(0))

                    # Uniformly sample num_segments frames across the clip.
                    num_frames = len(vr)
                    frame_indices = get_index(num_frames, self.num_segments)
                    images_group = list()
                    for frame_index in frame_indices:
                        img = Image.fromarray(vr[frame_index].asnumpy())
                        images_group.append(img)
                    pixel_values = self.transform(images_group)
                    # The group transform stacks frames along the channel axis.
                    TC, H, W = pixel_values.shape
                    pixel_values = pixel_values.reshape(TC // 3, 3, H, W).transpose(0, 1)  # [C, T, H, W]
                else:
                    # load numpy
                    if "s3://" in image_file:
                        pixel_values = self.tcs_loader(image_file)
                    else:
                        pixel_values = np.load(image_file)
                    pixel_values = torch.tensor(pixel_values).transpose(0, 1)
            except (AttributeError, OSError):
                # Corrupt/unreadable media: record the path and substitute a
                # random sample so training is not interrupted.
                with open("error.txt", 'a') as f:
                    f.write(image_file + '\n')
                i = random.randint(0, len(self.dataset))
                return self.__getitem__(i % len(self.dataset))

        # The processor expects batched (list-valued) fields; wrap, process,
        # then unwrap the singleton batch.
        for k, v in data.items():
            data[k] = [v]
        ret = self.husky_processor(data)
        for k, v in ret.items():
            ret[k] = v[0]

        if pixel_values is not None:
            ret["pixel_values"] = pixel_values

        self.cached_data_dict[i] = ret
        return ret
343
+
344
class WeightedConcatDataset(ConcatDataset):
    """Concatenation of datasets with weighted random index sampling.

    With ``batch_size <= 0`` (the default) indices are drawn through a
    plain ``WeightedRandomSampler``; with a positive ``batch_size`` whole
    batches are drawn via ``WeightedBatchSampler`` so each batch repeats
    a single weighted draw.

    NOTE(review): ``weights`` holds one entry per *dataset*, but
    ``WeightedRandomSampler`` treats its weights as one per *sample*, so
    in the default path only indices ``0 .. len(weights)-1`` can ever be
    produced — confirm this is the intended behaviour.
    """

    def __init__(
        self,
        datasets: List[Dataset],
        weights: Sequence[float] = None,
        replacement: bool = True,
        batch_size: int = -1,
        generator=None
    ) -> None:
        super().__init__(datasets)
        if weights is None:
            weights = [1.0] * len(self.datasets)
        weights_tensor = torch.as_tensor(weights, dtype=torch.double)
        if weights_tensor.dim() != 1:
            raise ValueError("weights should be a 1d sequence but given "
                             "weights have shape {}".format(tuple(weights_tensor.shape)))
        self.weights = weights_tensor
        self.batch_size = batch_size

        self.replacement = replacement
        self.generator = generator

        if batch_size > 0:
            # Batch mode: each dataset contributes only whole batches.
            self.task_batches = [len(d) // batch_size for d in datasets]
            self.num_samples = sum(self.task_batches) * batch_size
            self.sampler = WeightedBatchSampler(
                weights=self.weights,
                num_samples=self.num_samples,
                batch_size=self.batch_size,
                replacement=self.replacement
            )
        else:
            self.num_samples = sum(len(d) for d in datasets)
            self.sampler = WeightedRandomSampler(
                weights=self.weights,
                num_samples=self.num_samples,
                replacement=self.replacement
            )

    def __iter__(self) -> Iterator[int]:
        return iter(self.sampler)

    def __len__(self) -> int:
        return self.num_samples
388
+
389
class WeightedBatchSampler(Sampler[int]):
    """Yield sample indices in contiguous, weight-drawn batches.

    One source index is drawn per batch via ``torch.multinomial`` over
    ``weights`` and repeated ``batch_size`` times, so every batch
    consists of ``batch_size`` copies of the same drawn index.
    """

    weights: torch.Tensor
    num_samples: int
    batch_size: int
    replacement: bool

    def __init__(
        self,
        weights: Sequence[float],
        num_samples: int,
        batch_size: int,
        replacement: bool = True,
        generator=None
    ) -> None:
        def _is_positive_int(v):
            # bool is a subclass of int, so reject it explicitly.
            return isinstance(v, int) and not isinstance(v, bool) and v > 0

        if not _is_positive_int(batch_size):
            raise ValueError("batch_size should be a positive integer value, "
                             "but got batch_size={}".format(batch_size))
        if not _is_positive_int(num_samples):
            raise ValueError("num_samples should be a positive integer "
                             "value, but got num_samples={}".format(num_samples))
        if not isinstance(replacement, bool):
            raise ValueError("replacement should be a boolean value, but got "
                             "replacement={}".format(replacement))

        weights_tensor = torch.as_tensor(weights, dtype=torch.double)
        if weights_tensor.dim() != 1:
            raise ValueError("weights should be a 1d sequence but given "
                             "weights have shape {}".format(tuple(weights_tensor.shape)))

        self.weights = weights_tensor
        self.num_samples = num_samples
        self.batch_size = batch_size
        self.num_batches = num_samples // batch_size
        self.replacement = replacement
        self.generator = generator

    def __iter__(self) -> Iterator[int]:
        # One multinomial draw per batch, each expanded to a full batch.
        drawn = torch.multinomial(self.weights, self.num_batches, self.replacement, generator=self.generator)
        yield from drawn.repeat_interleave(self.batch_size).tolist()

    def __len__(self) -> int:
        return self.num_samples
robohusky/compression.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import gc
3
+ import glob
4
+ import os
5
+
6
+ from accelerate import init_empty_weights
7
+ from accelerate.utils import set_module_tensor_to_device
8
+ import torch
9
+ from torch import Tensor
10
+ from torch.nn import functional as F
11
+ import torch.nn as nn
12
+ from tqdm import tqdm
13
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
14
+
15
+
16
@dataclasses.dataclass
class CompressionConfig:
    """Group-wise quantization."""

    num_bits: int       # bit-width of the quantized values (compress() asserts <= 8)
    group_size: int     # number of elements per quantization group
    group_dim: int      # tensor dimension along which groups are formed
    symmetric: bool     # True: signed symmetric scheme; False: affine (min/max) scheme
    enabled: bool = True  # False makes compress()/decompress() pass-throughs
25
+
26
+
27
+ default_compression_config = CompressionConfig(
28
+ num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True
29
+ )
30
+
31
+
32
class CLinear(nn.Module):
    """Compressed Linear Layer.

    Keeps its weight in the group-quantized format produced by
    ``compress`` and dequantizes it on every forward pass; the bias (if
    any) stays dense.
    """

    def __init__(self, weight=None, bias=None, device=None):
        super().__init__()
        if weight is None:
            self.weight = None
        elif isinstance(weight, Tensor):
            # Dense tensor: quantize on the target device.
            self.weight = compress(weight.data.to(device), default_compression_config)
        else:
            # Already-compressed payload: keep as-is.
            self.weight = weight
        self.bias = bias

    def forward(self, input: Tensor) -> Tensor:
        weight = decompress(self.weight, default_compression_config)
        x = input.to(weight.dtype)
        if self.bias is None:
            return F.linear(x, weight)
        return F.linear(x, weight, self.bias.to(weight.dtype))
50
+
51
+
52
def compress_module(module, target_device):
    """Recursively replace every ``nn.Linear`` attribute of *module* and
    its submodules with a quantized ``CLinear`` living on *target_device*.

    The exact-type check (not isinstance) deliberately skips Linear
    subclasses, mirroring the original behaviour.
    """
    for attr_name in dir(module):
        attr = getattr(module, attr_name)
        if type(attr) == torch.nn.Linear:
            replacement = CLinear(attr.weight, attr.bias, target_device)
            setattr(module, attr_name, replacement)
    for _, child in module.named_children():
        compress_module(child, target_device)
63
+
64
+
65
+ def get_compressed_list(module, prefix=""):
66
+ compressed_list = []
67
+ for attr_str in dir(module):
68
+ target_attr = getattr(module, attr_str)
69
+ if type(target_attr) == torch.nn.Linear:
70
+ full_name = (
71
+ f"{prefix}.{attr_str}.weight" if prefix else f"{attr_str}.weight"
72
+ )
73
+ compressed_list.append(full_name)
74
+ for name, child in module.named_children():
75
+ child_prefix = f"{prefix}.{name}" if prefix else name
76
+ for each in get_compressed_list(child, child_prefix):
77
+ compressed_list.append(each)
78
+ return compressed_list
79
+
80
+
81
def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""):
    """Swap every ``nn.Linear`` under *module* for a ``CLinear`` built
    from the pre-compressed weight stored in *compressed_state_dict*
    (keyed by the same qualified names ``get_compressed_list`` emits)."""
    for attr_name in dir(module):
        attr = getattr(module, attr_name)
        if type(attr) == torch.nn.Linear:
            qualified = f"{prefix}.{attr_name}.weight" if prefix else f"{attr_name}.weight"
            setattr(
                module,
                attr_name,
                CLinear(compressed_state_dict[qualified], attr.bias, target_device),
            )
    for child_name, child in module.named_children():
        child_prefix = f"{prefix}.{child_name}" if prefix else child_name
        apply_compressed_weight(child, compressed_state_dict, target_device, child_prefix)
100
+
101
+
102
def load_compress_model(model_path, device, torch_dtype, use_fast=False):
    """Load a causal LM from *model_path* with its Linear weights quantized.

    The model skeleton is built on the meta device, each checkpoint shard
    is streamed in, Linear weights are compressed on the fly and all other
    tensors are moved to *device*, keeping peak memory low.

    Returns ``(model, tokenizer)``.

    NOTE(review): only ``pytorch_model*.bin`` shards are globbed —
    safetensors checkpoints would be silently skipped; confirm the
    checkpoints this is used with.  Also, ``low_cpu_mem_usage`` is a
    model-loading kwarg, not a config field — passing it to
    ``AutoConfig.from_pretrained`` likely has no effect; verify.
    """
    # partially load model
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=use_fast)
    base_pattern = os.path.join(model_path, "pytorch_model*.bin")
    files = glob.glob(base_pattern)

    # Build the architecture without allocating real weight storage.
    with init_empty_weights():
        config = AutoConfig.from_pretrained(
            model_path, low_cpu_mem_usage=True, torch_dtype=torch_dtype
        )
        model = AutoModelForCausalLM.from_config(config)
        linear_weights = get_compressed_list(model)

    compressed_state_dict = {}

    for filename in tqdm(files):
        tmp_state_dict = torch.load(filename)
        for name in tmp_state_dict:
            if name in linear_weights:
                # Linear weight: quantize immediately on the target device.
                tensor = tmp_state_dict[name].to(device).data.to(torch_dtype)
                compressed_state_dict[name] = compress(
                    tensor, default_compression_config
                )
            else:
                compressed_state_dict[name] = tmp_state_dict[name].to(device)
            # Drop references eagerly so each shard's memory is reclaimed.
            tmp_state_dict[name] = None
            tensor = None
            gc.collect()
            torch.cuda.empty_cache()

    # Materialize every non-Linear tensor onto the real device...
    for name in model.state_dict():
        if name not in linear_weights:
            set_module_tensor_to_device(
                model, name, device, value=compressed_state_dict[name]
            )
    # ...and replace the Linear modules with their compressed counterparts.
    apply_compressed_weight(model, compressed_state_dict, device)

    model.to(device)

    return model, tokenizer
142
+
143
+
144
def compress(tensor, config):
    """Simulate group-wise quantization.

    Splits *tensor* into groups of ``config.group_size`` along
    ``config.group_dim`` (zero-padding the tail group) and quantizes each
    group independently.  Returns ``(data, scale, original_shape)`` for
    the symmetric scheme or ``(data, mn, scale, original_shape)`` for the
    affine one; returns *tensor* unchanged when compression is disabled.
    """
    if not config.enabled:
        return tensor

    group_size = config.group_size
    num_bits = config.num_bits
    group_dim = config.group_dim
    symmetric = config.symmetric
    assert num_bits <= 8

    original_shape = tensor.shape
    dim_len = original_shape[group_dim]
    num_groups = (dim_len + group_size - 1) // group_size
    new_shape = (
        original_shape[:group_dim]
        + (num_groups, group_size)
        + original_shape[group_dim + 1:]
    )

    # Zero-pad so the grouped dimension divides evenly into groups.
    pad_len = (group_size - dim_len % group_size) % group_size
    if pad_len != 0:
        pad_shape = original_shape[:group_dim] + (pad_len,) + original_shape[group_dim + 1:]
        zeros = torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)
        tensor = torch.cat([tensor, zeros], dim=group_dim)
    data = tensor.view(new_shape)

    if symmetric:
        # Signed symmetric scheme: scale each group so its max |value| hits B.
        B = 2 ** (num_bits - 1) - 1
        scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0]
        data = (data * scale).clamp_(-B, B).round_().to(torch.int8)
        return data, scale, original_shape

    # Affine scheme: shift by the group minimum, scale the range onto [0, B].
    B = 2 ** num_bits - 1
    mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0]
    mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0]

    scale = B / (mx - mn)
    data = data - mn
    data.mul_(scale)

    data = data.clamp_(0, B).round_().to(torch.uint8)
    return data, mn, scale, original_shape
195
+
196
+
197
def decompress(packed_data, config):
    """Simulate group-wise dequantization.

    Inverse of ``compress``: rescales the quantized groups back to
    floating point, strips any zero-padding, and restores the original
    shape.  Returns the input unchanged when compression is disabled.
    """
    if not config.enabled:
        return packed_data

    group_size = config.group_size
    group_dim = config.group_dim

    if config.symmetric:
        data, scale, original_shape = packed_data
        data = data / scale
    else:
        data, mn, scale, original_shape = packed_data
        data = data / scale
        data.add_(mn)

    # Strip the zero-padding added by compress, if any.
    pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
    if not pad_len:
        return data.view(original_shape)

    padded_shape = (
        original_shape[:group_dim]
        + (original_shape[group_dim] + pad_len,)
        + original_shape[group_dim + 1:]
    )
    data = data.reshape(padded_shape)
    trim = [slice(0, n) for n in original_shape]
    return data[trim].contiguous()
robohusky/configuration_husky.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Husky model configuration"""
16
+
17
+ import copy
18
+ import os
19
+ from typing import Union
20
+
21
+ from transformers.configuration_utils import PretrainedConfig
22
+ from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
23
+ from transformers.utils import logging
24
+ from transformers.models.auto import CONFIG_MAPPING
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+ HUSKY_PRETRAINED_CONFIG_ARCHIVE_MAP = {
29
+ "wofmanaf/husky-7b": "https://huggingface.co/wofmanaf/husky-7b/resolve/main/config.json",
30
+ }
31
+
32
class HuskyVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`HuskyVisionModel`]. It is used to
    instantiate a Husky vision encoder according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the Husky
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1408):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 6144):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of the projection embeddings (stored on the config; not otherwise read here).
        num_hidden_layers (`int`, *optional*, defaults to 39):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries and values in the self-attention layers.
    """

    model_type = "husky_vision_model"

    def __init__(
        self,
        hidden_size=1408,
        intermediate_size=6144,
        projection_dim=512,
        num_hidden_layers=39,
        num_attention_heads=16,
        num_channels=3,
        image_size=224,
        patch_size=14,
        hidden_act="gelu",
        layer_norm_eps=0.00001,
        dropout=0.0,
        attention_dropout=0.0,
        initializer_range=1e-10,
        initializer_factor=1.0,
        qkv_bias=True,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.dropout = dropout
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.qkv_bias = qkv_bias

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Load this config, extracting the nested vision section when the
        checkpoint holds a full composite `HuskyConfig`."""
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from HuskyConfig
        if config_dict.get("model_type") == "husky":
            config_dict = config_dict["vision_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
+ return cls.from_dict(config_dict, **kwargs)
125
+
126
class HuskyQFormerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`HuskyQFormerModel`]. It is used to
    instantiate a Husky Querying Transformer (Q-Former) model according to the specified arguments, defining the
    model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the Husky [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5)
    architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
    Read the documentation from [`PretrainedConfig`] for more information.

    Note that [`HuskyQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention.

    Args:
        vocab_size (`int`, *optional*, defaults to 30522):
            Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling the model.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
            [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
            with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
        classifier_dropout (`float`, *optional*):
            The dropout ratio for the classification head.
        cross_attention_frequency (`int`, *optional*, defaults to 2):
            The frequency of adding cross-attention to the Transformer layers.
        encoder_hidden_size (`int`, *optional*, defaults to 1408):
            The hidden size of the hidden states for cross-attention.
    """
    model_type = "husky_qformer"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        classifier_dropout=None,
        cross_attention_frequency=2,
        encoder_hidden_size=1408,
        **kwargs,
    ):
        # pad_token_id is forwarded so the base class records it as a special token id.
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.classifier_dropout = classifier_dropout
        self.cross_attention_frequency = cross_attention_frequency
        self.encoder_hidden_size = encoder_hidden_size

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Load this config, extracting the nested Q-Former section when the
        checkpoint holds a full composite `HuskyConfig`."""
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        # get the qformer config dict if we are loading from HuskyConfig
        if config_dict.get("model_type") == "husky":
            config_dict = config_dict["qformer_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
+ return cls.from_dict(config_dict, **kwargs)
230
+
231
class HuskyConfig(PretrainedConfig):
    r"""
    [`HuskyConfig`] is the configuration class to store the configuration of a
    [`HuskyForConditionalGeneration`]. It is used to instantiate a Husky model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the Husky
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`HuskyVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`HuskyQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.

        kwargs (*optional*):
            Dictionary of keyword arguments.
    """

    model_type = "husky"
    # Composite config: serialized as nested vision/qformer/text sections.
    is_composition = True

    def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. initializing the HuskyVisionConfig with default values.")

        if qformer_config is None:
            qformer_config = {}
            logger.info("qformer_config is None. Initializing the HuskyQFormerConfig with default values.")

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")

        self.vision_config = HuskyVisionConfig(**vision_config)
        self.qformer_config = HuskyQFormerConfig(**qformer_config)
        # The language model class is resolved dynamically; "opt" is the fallback.
        text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        self.tie_word_embeddings = self.text_config.tie_word_embeddings
        self.is_encoder_decoder = self.text_config.is_encoder_decoder

        self.num_query_tokens = num_query_tokens
        # Cross-attention in the Q-Former attends over the vision encoder states.
        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    @classmethod
    def from_vision_qformer_text_configs(
        cls,
        vision_config: HuskyVisionConfig,
        qformer_config: HuskyQFormerConfig,
        text_config: PretrainedConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`HuskyConfig`] (or a derived class) from a Husky vision model, Q-Former and
        language model configurations.

        Returns:
            [`HuskyConfig`]: An instance of a configuration object
        """

        return cls(
            vision_config=vision_config.to_dict(),
            qformer_config=qformer_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        output = copy.deepcopy(self.__dict__)
        output["vision_config"] = self.vision_config.to_dict()
        output["qformer_config"] = self.qformer_config.to_dict()
        output["text_config"] = self.text_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
324
+
325
if __name__ == '__main__':
    # Smoke test: build a composite config from default sub-configs.
    # The previous line was truncated mid-expression
    # (`config = HuskyConfig.from_pretrain`), which raised AttributeError
    # when the module was run as a script -- `from_pretrain` is not an
    # attribute and the call was never completed.
    config = HuskyConfig()
    print(config)
robohusky/constants.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import IntEnum
2
+ import os
3
+
4
# For the gradio web server
SERVER_ERROR_MSG = (
    "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
)
MODERATION_MSG = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE FIX YOUR INPUT AND TRY AGAIN."
CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION."
# Maximum number of characters accepted in a single user input.
INPUT_CHAR_LEN_LIMIT = 2560
# Maximum number of turns kept in a single conversation.
CONVERSATION_LEN_LIMIT = 50
# Directory where server logs are written.
LOGDIR = "."

# For the controller and workers(could be overwritten through ENV variables.)
CONTROLLER_HEART_BEAT_EXPIRATION = int(
    os.getenv("FASTCHAT_CONTROLLER_HEART_BEAT_EXPIRATION", 90)
)
WORKER_HEART_BEAT_INTERVAL = int(os.getenv("FASTCHAT_WORKER_HEART_BEAT_INTERVAL", 30))
WORKER_API_TIMEOUT = int(os.getenv("FASTCHAT_WORKER_API_TIMEOUT", 100))
WORKER_API_EMBEDDING_BATCH_SIZE = int(os.getenv("WORKER_API_EMBEDDING_BATCH_SIZE", 4))


class ErrorCode(IntEnum):
    """
    https://platform.openai.com/docs/guides/error-codes/api-errors

    Numeric scheme: the leading digits mirror the corresponding HTTP status
    class (400 validation, 401 auth, 403 forbidden, 429 rate limiting,
    500 server errors); the trailing two digits enumerate specific causes.
    """

    VALIDATION_TYPE_ERROR = 40001

    INVALID_AUTH_KEY = 40101
    INCORRECT_AUTH_KEY = 40102
    NO_PERMISSION = 40103

    INVALID_MODEL = 40301
    PARAM_OUT_OF_RANGE = 40302
    CONTEXT_OVERFLOW = 40303

    RATE_LIMIT = 42901
    QUOTA_EXCEEDED = 42902
    ENGINE_OVERLOADED = 42903

    INTERNAL_ERROR = 50001
    CUDA_OUT_OF_MEMORY = 50002
    GRADIO_REQUEST_ERROR = 50003
    GRADIO_STREAM_UNKNOWN_ERROR = 50004
    CONTROLLER_NO_WORKER = 50005
    CONTROLLER_WORKER_TIMEOUT = 50006
robohusky/conversation.py ADDED
@@ -0,0 +1,511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Conversation prompt templates.
3
+ """
4
+
5
+ import dataclasses
6
+ from enum import auto, Enum
7
+ from typing import List, Any, Dict
8
+
9
+
10
class SeparatorStyle(Enum):
    """Separator styles.

    Each member selects a different concatenation scheme used by
    ``Conversation.get_prompt`` when rendering the dialogue into a string.
    """

    ADD_COLON_SINGLE = auto()        # "role: message<sep>" with one separator
    ADD_COLON_TWO = auto()           # alternates <sep>/<sep2> between turns
    ADD_COLON_SPACE_SINGLE = auto()  # like ADD_COLON_SINGLE; empty turn ends "role: "
    NO_COLON_SINGLE = auto()         # "rolemessage<sep>" (role tags carry their own markup)
    ADD_NEW_LINE_SINGLE = auto()     # "role\nmessage<sep>"
    DOLLY = auto()                   # Dolly-v2 scheme with extra blank lines after responses
    RWKV = auto()                    # RWKV scheme; normalizes CRLF and double newlines
    PHOENIX = auto()                 # wraps each message in <s>...</s>
22
+
23
@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history.

    Holds the system prompt, the ordered (role, message) pairs, and the
    separator scheme used to render the dialogue into a single prompt string.
    A ``None``/empty message produces the bare role header, cueing the model
    to generate the next turn.
    """

    # The name of this template
    name: str
    # The system prompt
    system: str
    # Two roles
    roles: List[str]
    # All messages. Each item is (role, message).
    messages: List[List[str]]
    # The number of few shot examples
    offset: int
    # Separators
    sep_style: SeparatorStyle
    sep: str
    sep2: str = None
    # Stop criteria (the default one is EOS token)
    stop_str: str = None
    # Stops generation if meeting any token in this list
    stop_token_ids: List[int] = None

    def get_prompt(self) -> str:
        """Get the prompt for generation.

        The exact concatenation (colons, separators, sentinel tokens) is
        selected by ``self.sep_style``; see ``SeparatorStyle`` members.
        """
        if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
            ret = self.system + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
            return ret
        elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
            # Two alternating separators: seps[0] after even turns (user),
            # seps[1] after odd turns (assistant; typically the EOS token).
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
            return ret
        elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
            ret = self.system + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ": "  # must be end with a space
            return ret
        elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
            # Role strings already carry their own markup (e.g. "<|user|>").
            ret = self.system
            for role, message in self.messages:
                if message:
                    ret += role + message + self.sep
                else:
                    ret += role
            return ret
        elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
            ret = self.system + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + "\n" + message + self.sep
                else:
                    ret += role + "\n"
            return ret
        elif self.sep_style == SeparatorStyle.DOLLY:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ":\n" + message + seps[i % 2]
                    # Extra blank lines after each completed assistant turn.
                    if i % 2 == 1:
                        ret += "\n\n"
                else:
                    ret += role + ":\n"
            return ret
        elif self.sep_style == SeparatorStyle.RWKV:
            ret = self.system
            for i, (role, message) in enumerate(self.messages):
                if message:
                    # RWKV expects normalized newlines: no CRLF, no blank lines
                    # inside a message.
                    ret += (
                        role
                        + ": "
                        + message.replace("\r\n", "\n").replace("\n\n", "\n")
                    )
                    ret += "\n\n"
                else:
                    ret += role + ":"
            return ret
        elif self.sep_style == SeparatorStyle.PHOENIX:
            ret = self.system
            for role, message in self.messages:
                if message:
                    ret += role + ": " + "<s>" + message + "</s>"
                else:
                    ret += role + ": " + "<s>"
            return ret
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

    def append_message(self, role: str, message: str):
        """Append a new message."""
        self.messages.append([role, message])

    def update_last_message(self, message: str):
        """Update the last output.

        The last message is typically set to be None when constructing the prompt,
        so we need to update it in-place after getting the response from a model.
        """
        self.messages[-1][1] = message

    def to_gradio_chatbot(self):
        """Convert the conversation to gradio chatbot format.

        Skips the few-shot examples (``self.offset``) and pairs each user
        message with the following assistant message.
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def to_openai_api_messages(self):
        """Convert the conversation to OpenAI chat completion format."""
        ret = [{"role": "system", "content": self.system}]

        for i, (_, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                ret.append({"role": "user", "content": msg})
            else:
                # Skip the trailing None placeholder of an unanswered turn.
                if msg is not None:
                    ret.append({"role": "assistant", "content": msg})
        return ret

    def copy(self):
        """Return a copy with an independent messages list (turn pairs are re-built)."""
        return Conversation(
            name=self.name,
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            stop_str=self.stop_str,
            stop_token_ids=self.stop_token_ids,
        )

    def dict(self):
        """Serialize the user-visible fields (separator config is omitted)."""
        return {
            "name": self.name,
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
        }
180
+
181
+
182
# A global registry for all conversation templates
conv_templates: Dict[str, Conversation] = {}


def register_conv_template(template: Conversation, override: bool = False):
    """Register a new conversation template under ``template.name``.

    Duplicate names are rejected unless ``override`` is True.
    """
    assert (
        override or template.name not in conv_templates
    ), f"{template.name} has been registered."
    conv_templates[template.name] = template


def get_conv_template(name: str) -> Conversation:
    """Get a conversation template (a fresh copy, safe to mutate)."""
    registered = conv_templates[name]
    return registered.copy()
196
+
197
+
198
# ---------------------------------------------------------------------------
# Built-in conversation templates.
#
# All templates below are registered into the module-level registry at import
# time; fetch a mutable working copy with get_conv_template(name). The exact
# role names, separators and stop criteria are part of each model's prompt
# contract and must not be edited casually.
# ---------------------------------------------------------------------------

# A template with one conversation example
register_conv_template(
    Conversation(
        name="one_shot",
        system="A chat between a curious human and an artificial intelligence assistant. "
               "The assistant gives helpful, detailed, and polite answers to the human's questions.",
        roles=("Human", "Assistant"),
        messages=(
            (
                "Human",
                "Got any creative ideas for a 10 year old’s birthday?",
            ),
            (
                "Assistant",
                """Of course! Here are some creative ideas for a 10-year-old's birthday party:
1. Treasure Hunt: Organize a treasure hunt in your backyard or nearby park. Create clues and riddles for the kids to solve, leading them to hidden treasures and surprises.
2. Science Party: Plan a science-themed party where kids can engage in fun and interactive experiments. You can set up different stations with activities like making slime, erupting volcanoes, or creating simple chemical reactions.
3. Outdoor Movie Night: Set up a backyard movie night with a projector and a large screen or white sheet. Create a cozy seating area with blankets and pillows, and serve popcorn and snacks while the kids enjoy a favorite movie under the stars.
4. DIY Crafts Party: Arrange a craft party where kids can unleash their creativity. Provide a variety of craft supplies like beads, paints, and fabrics, and let them create their own unique masterpieces to take home as party favors.
5. Sports Olympics: Host a mini Olympics event with various sports and games. Set up different stations for activities like sack races, relay races, basketball shooting, and obstacle courses. Give out medals or certificates to the participants.
6. Cooking Party: Have a cooking-themed party where the kids can prepare their own mini pizzas, cupcakes, or cookies. Provide toppings, frosting, and decorating supplies, and let them get hands-on in the kitchen.
7. Superhero Training Camp: Create a superhero-themed party where the kids can engage in fun training activities. Set up an obstacle course, have them design their own superhero capes or masks, and organize superhero-themed games and challenges.
8. Outdoor Adventure: Plan an outdoor adventure party at a local park or nature reserve. Arrange activities like hiking, nature scavenger hunts, or a picnic with games. Encourage exploration and appreciation for the outdoors.
Remember to tailor the activities to the birthday child's interests and preferences. Have a great celebration!""",
            ),
        ),
        offset=2,
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n### ",
        stop_str="###",
    )
)

# Vicuna v1.1 template
register_conv_template(
    Conversation(
        name="vicuna_v1.1",
        system="A chat between a curious user and an artificial intelligence assistant. "
               "The assistant gives helpful, detailed, and polite answers to the user's questions.",
        roles=("USER", "ASSISTANT"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep=" ",
        sep2="</s>",
    )
)

# Husky template
register_conv_template(
    Conversation(
        name="husky",
        system="",
        roles=("Human", "Assistant"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep=" ",
        sep2="</s>",
    )
)

# Koala default template
register_conv_template(
    Conversation(
        name="koala_v1",
        system="BEGINNING OF CONVERSATION:",
        roles=("USER", "GPT"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep=" ",
        sep2="</s>",
    )
)

# Alpaca default template
register_conv_template(
    Conversation(
        name="alpaca",
        system="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        roles=("### Instruction:", "### Response:"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
        sep="\n\n",
    )
)

# Dolly V2 default template
register_conv_template(
    Conversation(
        name="dolly_v2",
        system="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
        roles=("### Instruction", "### Response"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.DOLLY,
        sep="\n\n",
        sep2="### End",
    )
)

# OpenAssistant Pythia default template
register_conv_template(
    Conversation(
        name="oasst_pythia",
        system="",
        roles=("<|prompter|>", "<|assistant|>"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="<|endoftext|>",
    )
)

# StableLM Alpha default template
register_conv_template(
    Conversation(
        name="stablelm",
        system="""<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
""",
        roles=("<|USER|>", "<|ASSISTANT|>"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="",
        stop_token_ids=[50278, 50279, 50277, 1, 0],
    )
)

# Baize default template
register_conv_template(
    Conversation(
        name="baize",
        system="The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n",
        roles=("[|Human|]", "[|AI|]"),
        messages=(
            ("[|Human|]", "Hello!"),
            ("[|AI|]", "Hi!"),
        ),
        offset=2,
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="\n",
        stop_str="[|Human|]",
    )
)

# RWKV-4-Raven default template
register_conv_template(
    Conversation(
        name="rwkv",
        system="",
        roles=("Bob", "Alice"),
        messages=(
            ("Bob", "hi"),
            (
                "Alice",
                "Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.",
            ),
        ),
        offset=2,
        sep_style=SeparatorStyle.RWKV,
        sep="",
        stop_str="\n\n",
    )
)

# Buddy default template
register_conv_template(
    Conversation(
        name="openbuddy",
        system="""Consider a conversation between User (a human) and Assistant (named Buddy).
Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy
Buddy cannot access the Internet.
Buddy can fluently speak the user's language (e.g. English, Chinese).
Buddy can generate poems, stories, code, essays, songs, parodies, and more.
Buddy possesses vast knowledge about the world, history, and culture.
Buddy's responses are always safe, creative, high-quality, human-like, and interesting.
Buddy strictly refuses to discuss political, NSFW, or other unsafe topics.

User: Hi.
Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?""",
        roles=("User", "Assistant"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n",
    )
)

# Phoenix default template
register_conv_template(
    Conversation(
        name="phoenix",
        system="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
        roles=("Human", "Assistant"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.PHOENIX,
        sep="</s>",
    )
)

# ChatGPT default template
# NOTE: sep_style=None — rendered via the OpenAI API, never via get_prompt().
register_conv_template(
    Conversation(
        name="chatgpt",
        system="You are a helpful assistant.",
        roles=("user", "assistant"),
        messages=(),
        offset=0,
        sep_style=None,
        sep=None,
    )
)

# Claude default template
register_conv_template(
    Conversation(
        name="claude",
        system="",
        roles=("Human", "Assistant"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n\n",
    )
)

# MPT default template
register_conv_template(
    Conversation(
        name="mpt",
        system="""<|im_start|>system
- You are a helpful assistant chatbot trained by MosaicML.
- You answer questions.
- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.
""",
        roles=("<|im_start|>user", "<|im_start|>assistant"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
        sep="<|im_end|>",
        stop_token_ids=[50278, 0],
    )
)

# Bard default template
# NOTE: sep_style=None — rendered via the Bard API, never via get_prompt().
register_conv_template(
    Conversation(
        name="bard",
        system="",
        roles=("0", "1"),
        messages=(),
        offset=0,
        sep_style=None,
        sep=None,
    )
)

# BiLLa default template
register_conv_template(
    Conversation(
        name="billa",
        system="",
        roles=("Human", "Assistant"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
        sep="\n",
        stop_str="Human:",
    )
)

# RedPajama INCITE default template
register_conv_template(
    Conversation(
        name="redpajama-incite",
        system="",
        roles=("<human>", "<bot>"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n",
        stop_str="<human>",
    )
)

# h2oGPT default template
register_conv_template(
    Conversation(
        name="h2ogpt",
        system="",
        roles=("<|prompt|>", "<|answer|>"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="</s>",
    )
)
504
+
505
+ if __name__ == "__main__":
506
+ conv = get_conv_template("husky")
507
+ conv.append_message(conv.roles[0], "Hello!")
508
+ conv.append_message(conv.roles[1], "Hi!")
509
+ conv.append_message(conv.roles[0], "How are you?")
510
+ conv.append_message(conv.roles[1], None)
511
+ print(conv.get_prompt())
robohusky/convert_fp16.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage:
3
+ srun -p INTERN2 --job-name='convert_2_fp16' --gres=gpu:0 --cpus-per-task=8 --quotatype="auto" python -u husky/convert_fp16.py --in-checkpoint work_dirs/llm/husky-13b/zh_bell/checkpoint-9500 --out-checkpoint work_dirs/llm/husky-13b/zh_bell/
4
+ """
5
+ import argparse
6
+ import os.path
7
+
8
+ from transformers import AutoTokenizer, AutoModelForCausalLM
9
+ import torch
10
+
11
def convert_fp16(in_checkpoint, out_checkpoint):
    """Load a causal-LM checkpoint, cast it to fp16, and save model + tokenizer.

    Args:
        in_checkpoint: Path or hub identifier of the source checkpoint.
        out_checkpoint: Directory to write the fp16 model and tokenizer to
            (created, including parents, if missing).
    """
    tokenizer = AutoTokenizer.from_pretrained(in_checkpoint, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(
        in_checkpoint, torch_dtype=torch.float16, low_cpu_mem_usage=False
    )
    # makedirs(exist_ok=True) also creates missing parent directories and is
    # race-free, unlike the original os.mkdir which fails on nested paths.
    os.makedirs(out_checkpoint, exist_ok=True)
    model.save_pretrained(out_checkpoint)
    tokenizer.save_pretrained(out_checkpoint)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-checkpoint", type=str, help="Path to the model")
    parser.add_argument("--out-checkpoint", type=str, help="Path to the output model")
    args = parser.parse_args()

    convert_fp16(args.in_checkpoint, args.out_checkpoint)
robohusky/convert_husky_fp16.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage:
3
+ srun -p INTERN2 --job-name='convert_2_fp16' --gres=gpu:0 --cpus-per-task=8 --quotatype="auto" python -u husky/convert_husky_fp16.py --in-checkpoint work_dirs/husky_v3/multi_align/checkpoint-48000 --out-checkpoint work_dirs/husky_v3/multi_align_fp16
4
+ """
5
+ import argparse
6
+ import os.path
7
+
8
+ from transformers import AutoTokenizer
9
+ from husky.model.modeling_husky_multi import HuskyForConditionalGeneration
10
+ import torch
11
+
12
def convert_fp16(in_checkpoint, out_checkpoint):
    """Load a Husky checkpoint, cast it to fp16, and save model + tokenizer.

    Args:
        in_checkpoint: Path of the source Husky checkpoint.
        out_checkpoint: Directory to write the fp16 model and tokenizer to
            (created, including parents, if missing).

    NOTE(review): the module imports `husky.model.modeling_husky_multi`, but
    this package directory is named `robohusky` — verify the import path.
    """
    tokenizer = AutoTokenizer.from_pretrained(in_checkpoint, use_fast=False)
    model = HuskyForConditionalGeneration.from_pretrained(
        in_checkpoint, torch_dtype=torch.float16, low_cpu_mem_usage=False
    )
    # makedirs(exist_ok=True) also creates missing parent directories and is
    # race-free, unlike the original os.mkdir which fails on nested paths.
    os.makedirs(out_checkpoint, exist_ok=True)
    model.save_pretrained(out_checkpoint)
    tokenizer.save_pretrained(out_checkpoint)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-checkpoint", type=str, help="Path to the model")
    parser.add_argument("--out-checkpoint", type=str, help="Path to the output model")
    args = parser.parse_args()

    convert_fp16(args.in_checkpoint, args.out_checkpoint)
robohusky/convert_reward_fp16.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage:
3
+ srun -p INTERN2 --job-name='convert_2_fp16' --gres=gpu:0 --cpus-per-task=8 --quotatype="auto" python -u husky/convert_reward_fp16.py --in-checkpoint work_dirs/llm/Ziya-LLaMA-7B-Reward --out-checkpoint work_dirs/llm/reward_model
4
+ """
5
+ import argparse
6
+ import os.path
7
+
8
+ from transformers import LlamaTokenizer, AutoModelForSequenceClassification
9
+ import torch
10
+
11
def convert_fp16(in_checkpoint, out_checkpoint):
    """Load a reward-model checkpoint, cast it to fp16, and save model + tokenizer.

    Args:
        in_checkpoint: Path of the source sequence-classification checkpoint.
        out_checkpoint: Directory to write the fp16 model and tokenizer to
            (created, including parents, if missing).
    """
    tokenizer = LlamaTokenizer.from_pretrained(in_checkpoint, use_fast=False)
    model = AutoModelForSequenceClassification.from_pretrained(
        in_checkpoint, torch_dtype=torch.float16, low_cpu_mem_usage=False
    )
    # makedirs(exist_ok=True) also creates missing parent directories and is
    # race-free, unlike the original os.mkdir which fails on nested paths.
    os.makedirs(out_checkpoint, exist_ok=True)
    model.save_pretrained(out_checkpoint)
    tokenizer.save_pretrained(out_checkpoint)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-checkpoint", type=str, help="Path to the model")
    parser.add_argument("--out-checkpoint", type=str, help="Path to the output model")
    args = parser.parse_args()

    convert_fp16(args.in_checkpoint, args.out_checkpoint)
robohusky/dist_utils.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import os
3
+ import socket
4
+ import subprocess
5
+ from collections import OrderedDict
6
+
7
+ import torch
8
+ import torch.multiprocessing as mp
9
+ from torch import distributed as dist
10
+
11
+
12
+ def _find_free_port():
13
+ # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501
14
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
15
+ # Binding to port 0 will cause the OS to find an available port for us
16
+ sock.bind(('', 0))
17
+ port = sock.getsockname()[1]
18
+ sock.close()
19
+ # NOTE: there is still a chance the port could be taken by other processes.
20
+ return port
21
+
22
+
23
+ def _is_free_port(port):
24
+ ips = socket.gethostbyname_ex(socket.gethostname())[-1]
25
+ ips.append('localhost')
26
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
27
+ return all(s.connect_ex((ip, port)) != 0 for ip in ips)
28
+
29
+
30
def init_dist(launcher, backend='nccl', **kwargs):
    """Initialize torch.distributed for the given launcher type.

    Supported launchers: 'pytorch', 'mpi', 'slurm'. Anything else raises
    ValueError. The multiprocessing start method is forced to 'spawn' if it
    has not been set yet.
    """
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    if launcher == 'pytorch':
        _init_dist_pytorch(backend, **kwargs)
        return
    if launcher == 'mpi':
        _init_dist_mpi(backend, **kwargs)
        return
    if launcher == 'slurm':
        _init_dist_slurm(backend, **kwargs)
        return
    raise ValueError(f'Invalid launcher type: {launcher}')
41
+
42
+
43
def _init_dist_pytorch(backend, **kwargs):
    """Initialize distributed training launched via torchrun/torch.distributed."""
    # TODO: use local_rank instead of rank % num_gpus
    global_rank = int(os.environ['RANK'])
    gpus_per_node = torch.cuda.device_count()
    torch.cuda.set_device(global_rank % gpus_per_node)
    dist.init_process_group(backend=backend, **kwargs)
49
+
50
+
51
def _init_dist_mpi(backend, **kwargs):
    """Initialize distributed training launched via OpenMPI (mpirun)."""
    local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
    torch.cuda.set_device(local_rank)
    # 29500 is torch.distributed default port
    os.environ.setdefault('MASTER_PORT', '29500')
    if 'MASTER_ADDR' not in os.environ:
        raise KeyError('The environment variable MASTER_ADDR is not set')
    # Translate OpenMPI's world description into the variables torch expects.
    os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
    os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
    dist.init_process_group(backend=backend, **kwargs)
62
+
63
+
64
def _init_dist_slurm(backend, port=None):
    """Initialize slurm distributed training environment.

    If argument ``port`` is not specified, then the master port will be system
    environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system
    environment variable, then a default port ``29500`` will be used.

    Args:
        backend (str): Backend of torch.distributed.
        port (int, optional): Master port. Defaults to None.
    """
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    # Spread the node-local processes across the node's GPUs.
    torch.cuda.set_device(proc_id % num_gpus)
    # Resolve the first hostname in the SLURM node list; it becomes the
    # rendezvous master address.
    addr = subprocess.getoutput(
        f'scontrol show hostname {node_list} | head -n1')
    # specify master port
    if port is not None:
        os.environ['MASTER_PORT'] = str(port)
    elif 'MASTER_PORT' in os.environ:
        pass  # use MASTER_PORT in the environment variable
    else:
        # if torch.distributed default port(29500) is available
        # then use it, else find a free port
        if _is_free_port(29500):
            os.environ['MASTER_PORT'] = '29500'
        else:
            os.environ['MASTER_PORT'] = str(_find_free_port())
    # use MASTER_ADDR in the environment variable if it already exists
    if 'MASTER_ADDR' not in os.environ:
        os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
    os.environ['RANK'] = str(proc_id)
    dist.init_process_group(backend=backend)
robohusky/llama2_flash_attn_monkey_patch.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ from flash_attn import __version__ as flash_attn_version
6
+ from flash_attn.bert_padding import pad_input, unpad_input
7
+ from flash_attn.flash_attn_interface import (
8
+ flash_attn_func,
9
+ flash_attn_varlen_kvpacked_func,
10
+ )
11
+ from transformers.models.llama.modeling_llama import (
12
+ LlamaAttention,
13
+ LlamaModel,
14
+ rotate_half,
15
+ )
16
+
17
+ def apply_rotary_pos_emb(q, k, cos_sin, position_ids):
18
+ gather_indices = position_ids[:, :, None, None] # [bsz, seq_len, 1, 1]
19
+ gather_indices = gather_indices.repeat(
20
+ 1, 1, cos_sin[0].shape[1], cos_sin[0].shape[3]
21
+ )
22
+ bsz = gather_indices.shape[0]
23
+ cos, sin = (
24
+ torch.gather(x.transpose(1, 2).repeat(bsz, 1, 1, 1), 1, gather_indices)
25
+ for x in cos_sin
26
+ )
27
+ q, k = ((x * cos) + (rotate_half(x) * sin) for x in (q, k))
28
+ return q, k
29
+
30
+ def forward(
31
+ self,
32
+ hidden_states: torch.Tensor,
33
+ attention_mask: Optional[torch.Tensor] = None,
34
+ position_ids: Optional[torch.Tensor] = None,
35
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
36
+ output_attentions: bool = False,
37
+ use_cache: bool = False,
38
+ padding_mask: Optional[torch.Tensor] = None,
39
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
40
+ if output_attentions:
41
+ warnings.warn(
42
+ "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
43
+ )
44
+
45
+ bsz, q_len, _ = hidden_states.size()
46
+ kv_heads = getattr(self, "num_key_value_heads", self.num_heads)
47
+
48
+ q, k, v = (
49
+ op(hidden_states).view(bsz, q_len, nh, self.head_dim)
50
+ for op, nh in (
51
+ (self.q_proj, self.num_heads),
52
+ (self.k_proj, kv_heads),
53
+ (self.v_proj, kv_heads),
54
+ )
55
+ )
56
+ # shape: (b, s, num_heads, head_dim)
57
+
58
+ kv_seq_len = k.shape[1]
59
+ past_kv_len = 0
60
+ if past_key_value is not None:
61
+ past_kv_len = past_key_value[0].shape[2]
62
+ kv_seq_len += past_kv_len
63
+
64
+ cos_sin = self.rotary_emb(v, seq_len=kv_seq_len)
65
+ q, k = apply_rotary_pos_emb(q, k, cos_sin, position_ids)
66
+
67
+ if past_key_value is not None:
68
+ assert (
69
+ flash_attn_version >= "2.1.0"
70
+ ), "past_key_value support requires flash-attn >= 2.1.0"
71
+ # reuse k, v
72
+ k = torch.cat([past_key_value[0].transpose(1, 2), k], dim=1)
73
+ v = torch.cat([past_key_value[1].transpose(1, 2), v], dim=1)
74
+
75
+ past_key_value = (k.transpose(1, 2), v.transpose(1, 2)) if use_cache else None
76
+
77
+ if attention_mask is None:
78
+ output = flash_attn_func(q, k, v, 0.0, softmax_scale=None, causal=True).view(
79
+ bsz, q_len, -1
80
+ )
81
+ else:
82
+ q, indices, cu_q_lens, max_s = unpad_input(q, attention_mask[:, -q_len:])
83
+ # We can skip concat and call unpad twice but seems better to call unpad only once.
84
+ kv, _, cu_k_lens, max_k = unpad_input(
85
+ torch.stack((k, v), dim=2), attention_mask
86
+ )
87
+ output_unpad = flash_attn_varlen_kvpacked_func(
88
+ q,
89
+ kv,
90
+ cu_q_lens,
91
+ cu_k_lens,
92
+ max_s,
93
+ max_k,
94
+ 0.0,
95
+ softmax_scale=None,
96
+ causal=True,
97
+ )
98
+ output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
99
+ output = pad_input(output_unpad, indices, bsz, q_len)
100
+
101
+ return self.o_proj(output), None, past_key_value
102
+
103
+ # Disable the transformation of the attention mask in LlamaModel as flash attention
104
+ # takes a boolean key_padding_mask. Fills in the past kv length for use in forward.
105
+ def _prepare_decoder_attention_mask(
106
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
107
+ ):
108
+ # [bsz, seq_len]
109
+ if past_key_values_length > 0 and attention_mask is not None:
110
+ attention_mask = torch.cat(
111
+ (
112
+ torch.full(
113
+ (input_shape[0], past_key_values_length),
114
+ True,
115
+ dtype=attention_mask.dtype,
116
+ device=attention_mask.device,
117
+ ),
118
+ attention_mask,
119
+ ),
120
+ dim=-1,
121
+ )
122
+
123
+ if attention_mask is not None and torch.all(attention_mask):
124
+ return None # This uses the faster call when training with full samples
125
+
126
+ return attention_mask
127
+
128
+ def replace_llama_attn_with_flash_attn():
129
+ cuda_major, cuda_minor = torch.cuda.get_device_capability()
130
+ if cuda_major < 8:
131
+ warnings.warn(
132
+ "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
133
+ "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
134
+ )
135
+
136
+ LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
137
+ LlamaAttention.forward = forward
138
+
139
+ def test():
140
+ from robohusky.train.llama_flash_attn_monkey_patch import forward as fastchat_forward
141
+ from transformers.models.llama.configuration_llama import LlamaConfig
142
+
143
+ config = LlamaConfig(
144
+ hidden_size=1024,
145
+ intermediate_size=128,
146
+ num_hidden_layers=1,
147
+ num_attention_heads=8,
148
+ max_position_embeddings=16,
149
+ )
150
+ device = torch.device("cuda")
151
+ model = LlamaModel(config)
152
+ attn = LlamaAttention(config).to(device).half()
153
+ bsz, hs, seqlen = 2, config.hidden_size, config.max_position_embeddings
154
+ position_ids = torch.arange(seqlen, dtype=torch.long, device=device).view(
155
+ -1, seqlen
156
+ )
157
+
158
+ mask = torch.full((bsz, seqlen), True, dtype=torch.bool, device=device)
159
+ for i in range(4):
160
+ hidden = torch.rand((bsz, seqlen, hs), dtype=torch.float16, device=device)
161
+ if i:
162
+ mask[0, -i:] = False
163
+ mask[1, :i] = False
164
+
165
+ lmask = model._prepare_decoder_attention_mask(mask, hidden.shape[:2], hidden, 0)
166
+ ref, _, _ = attn.forward(
167
+ hidden, attention_mask=lmask, position_ids=position_ids
168
+ )
169
+
170
+ fast, _, _ = fastchat_forward(
171
+ attn, hidden, attention_mask=mask, position_ids=position_ids
172
+ )
173
+
174
+ lmask = _prepare_decoder_attention_mask(
175
+ model, mask, hidden.shape[:2], hidden, 0
176
+ )
177
+ test, _, _ = forward(
178
+ attn, hidden, attention_mask=lmask, position_ids=position_ids
179
+ )
180
+
181
+ print(f"Mean(abs(ref)) = {torch.mean(torch.abs(ref))}")
182
+ print(f"Mean(abs(ref - fast)) = {torch.mean(torch.abs(ref - fast))}")
183
+ print(f"Mean(abs(ref - test)) = {torch.mean(torch.abs(ref - test))}")
184
+ print(f"Mean(abs(fast - test)) = {torch.mean(torch.abs(fast - test))}")
185
+ print(f"allclose(fast, test) = {torch.allclose(fast, test)}")
186
+
187
+ with torch.no_grad():
188
+ # Also check that past_kv is handled properly
189
+ hidden = torch.rand((bsz, seqlen, hs), dtype=torch.float16, device=device)
190
+ part_len = seqlen // 4
191
+ assert part_len * 4 == seqlen
192
+ mask = torch.full((bsz, seqlen), True, dtype=torch.bool, device=device)
193
+ mask[0, -2:] = False
194
+ lmask = _prepare_decoder_attention_mask(
195
+ model, mask, hidden.shape[:2], hidden, 0
196
+ )
197
+ oneshot, _, _ = forward(
198
+ attn, hidden, attention_mask=lmask, position_ids=position_ids
199
+ )
200
+ parts = []
201
+ past_kv, past_kv_len = None, 0
202
+ for i in range(4):
203
+ start = part_len * i
204
+ end = start + part_len
205
+ hidden_part = hidden[:, start:end, ...]
206
+ lmask = _prepare_decoder_attention_mask(
207
+ model,
208
+ mask[:, start:end],
209
+ hidden_part.shape[:2],
210
+ hidden_part,
211
+ past_kv_len,
212
+ )
213
+ part, _, past_kv = forward(
214
+ attn,
215
+ hidden_part.clone(),
216
+ attention_mask=lmask,
217
+ position_ids=position_ids[:, start:end],
218
+ past_key_value=past_kv,
219
+ use_cache=True,
220
+ )
221
+ parts.append(part)
222
+ past_kv_len = past_kv[0].shape[2]
223
+
224
+ print(
225
+ f"allclose(oneshot[:, 0], parts[0]) = {torch.allclose(oneshot[:, :part_len], parts[0])}"
226
+ )
227
+ print(
228
+ f"allclose(oneshot, parts) = {torch.allclose(oneshot, torch.cat(parts, dim=1))}"
229
+ )
230
+
231
+ if __name__ == "__main__":
232
+ test()
robohusky/model/__init__.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
17
+
18
+ _import_structure = {
19
+ "configuration_husky": [
20
+ "HUSKY_PRETRAINED_CONFIG_ARCHIVE_MAP",
21
+ "HuskyConfig",
22
+ "HuskyQFormerConfig",
23
+ "HuskyVisionConfig",
24
+ ],
25
+ "processing_husky": ["HuskyProcessor"],
26
+ }
27
+
28
+ try:
29
+ if not is_torch_available():
30
+ raise OptionalDependencyNotAvailable()
31
+ except OptionalDependencyNotAvailable:
32
+ pass
33
+ else:
34
+ _import_structure["modeling_husky"] = [
35
+ "HUSKY_PRETRAINED_MODEL_ARCHIVE_LIST",
36
+ "HuskyModel",
37
+ "HuskyQFormerModel",
38
+ "HuskyPreTrainedModel",
39
+ "HuskyForConditionalGeneration",
40
+ "HuskyVisionModel",
41
+ ]
42
+
43
+ if TYPE_CHECKING:
44
+ from .configuration_husky import (
45
+ HUSKY_PRETRAINED_CONFIG_ARCHIVE_MAP,
46
+ HuskyConfig,
47
+ HuskyVisionConfig,
48
+ HuskyQFormerConfig
49
+ )
50
+ from .processing_husky import HuskyProcessor
51
+
52
+ try:
53
+ if not is_torch_available():
54
+ raise OptionalDependencyNotAvailable()
55
+ except OptionalDependencyNotAvailable:
56
+ pass
57
+ else:
58
+ from .modeling_husky import (
59
+ HUSKY_PRETRAINED_MODEL_ARCHIVE_LIST,
60
+ HuskyForConditionalGeneration,
61
+ HuskyModel,
62
+ HuskyPreTrainedModel,
63
+ HuskyQFormerModel,
64
+ HuskyVisionModel,
65
+ )
66
+
67
+ else:
68
+ import sys
69
+
70
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
robohusky/model/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (1.14 kB). View file
 
robohusky/model/__pycache__/configuration_husky.cpython-38.pyc ADDED
Binary file (13.4 kB). View file
 
robohusky/model/__pycache__/modeling_husky_embody2.cpython-38.pyc ADDED
Binary file (54.6 kB). View file
 
robohusky/model/compression.py ADDED
File without changes
robohusky/model/configuration_husky.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Husky model configuration"""
16
+
17
+ import copy
18
+ import os
19
+ from typing import Union
20
+
21
+ from transformers.configuration_utils import PretrainedConfig
22
+ from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
23
+ from transformers.utils import logging
24
+
25
+ from transformers.models.auto import CONFIG_MAPPING
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ HUSKY_PRETRAINED_CONFIG_ARCHIVE_MAP = {
30
+ "wofmanaf/husky-7b": "https://huggingface.co/wofmanaf/husky-7b/resolve/main/config.json",
31
+ }
32
+
33
+ class HuskyVisionConfig(PretrainedConfig):
34
+ r"""
35
+ This is the configuration class to store the configuration of a [`HuskyVisionModel`]. It is used to
36
+ instantiate a Husky vision encoder according to the specified arguments, defining the model architecture.
37
+ Instantiating a configuration defaults will yield a similar configuration to that of the Husky architecture.
38
+
39
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
40
+ documentation from [`PretrainedConfig`] for more information.
41
+
42
+ Args:
43
+ hidden_size (`int`, *optional*, defaults to 1408):
44
+ Dimensionality of the encoder layers and the pooler layer.
45
+ intermediate_size (`int`, *optional*, defaults to 6144):
46
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
47
+ num_hidden_layers (`int`, *optional*, defaults to 39):
48
+ Number of hidden layers in the Transformer encoder.
49
+ num_attention_heads (`int`, *optional*, defaults to 16):
50
+ Number of attention heads for each attention layer in the Transformer encoder.
51
+ image_size (`int`, *optional*, defaults to 224):
52
+ The size (resolution) of each image.
53
+ patch_size (`int`, *optional*, defaults to 14):
54
+ The size (resolution) of each patch.
55
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
56
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
57
+ `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
58
+ to 1e-5): The epsilon used by the layer normalization layers.
59
+ dropout (`float`, *optional*, defaults to 0.0):
60
+ The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
61
+ attention_dropout (`float`, *optional*, defaults to 0.0):
62
+ The dropout ratio for the attention probabilities.
63
+ initializer_range (`float`, *optional*, defaults to 0.02):
64
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
65
+ initializer_factor (`float``, *optional*, defaults to 1):
66
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
67
+ testing).
68
+ qkv_bias (`bool`, *optional*, defaults to `True`):
69
+ Whether to add a bias to the queries and values in the self-attention layers.
70
+ """
71
+
72
+ model_type = "husky_vision_model"
73
+
74
+ def __init__(
75
+ self,
76
+ hidden_size=1408,
77
+ intermediate_size=6144,
78
+ projection_dim=512,
79
+ num_hidden_layers=39,
80
+ num_attention_heads=16,
81
+ num_channels=3,
82
+ image_size=224,
83
+ patch_size=14,
84
+ hidden_act="gelu",
85
+ layer_norm_eps=0.00001,
86
+ dropout=0.0,
87
+ attention_dropout=0.0,
88
+ initializer_range=1e-10,
89
+ initializer_factor=1.0,
90
+ qkv_bias=True,
91
+ _flash_attn_2_enabled=True,
92
+ **kwargs,
93
+ ):
94
+ super().__init__(**kwargs)
95
+
96
+ self.hidden_size = hidden_size
97
+ self.intermediate_size = intermediate_size
98
+ self.projection_dim = projection_dim
99
+ self.dropout = dropout
100
+ self.num_hidden_layers = num_hidden_layers
101
+ self.num_attention_heads = num_attention_heads
102
+ self.num_channels = num_channels
103
+ self.patch_size = patch_size
104
+ self.image_size = image_size
105
+ self.initializer_range = initializer_range
106
+ self.initializer_factor = initializer_factor
107
+ self.attention_dropout = attention_dropout
108
+ self.layer_norm_eps = layer_norm_eps
109
+ self.hidden_act = hidden_act
110
+ self.qkv_bias = qkv_bias
111
+ self._flash_attn_2_enabled = _flash_attn_2_enabled
112
+
113
+ @classmethod
114
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
115
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
116
+
117
+ # get the vision config dict if we are loading from HuskyConfig
118
+ if config_dict.get("model_type") == "husky":
119
+ config_dict = config_dict["vision_config"]
120
+
121
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
122
+ logger.warning(
123
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
124
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
125
+ )
126
+
127
+ return cls.from_dict(config_dict, **kwargs)
128
+
129
+ class HuskyQFormerConfig(PretrainedConfig):
130
+ r"""
131
+ This is the configuration class to store the configuration of a [`HuskyQFormerModel`]. It is used to
132
+ instantiate a Husky Querying Transformer (Q-Former) model according to the specified arguments, defining the
133
+ model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
134
+ the Husky [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5)
135
+ architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
136
+ Read the documentation from [`PretrainedConfig`] for more information.
137
+
138
+ Note that [`HuskyQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention.
139
+
140
+ Args:
141
+ vocab_size (`int`, *optional*, defaults to 30522):
142
+ Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by
143
+ the `inputs_ids` passed when calling the model.
144
+ hidden_size (`int`, *optional*, defaults to 768):
145
+ Dimensionality of the encoder layers and the pooler layer.
146
+ num_hidden_layers (`int`, *optional*, defaults to 12):
147
+ Number of hidden layers in the Transformer encoder.
148
+ num_attention_heads (`int`, *optional*, defaults to 12):
149
+ Number of attention heads for each attention layer in the Transformer encoder.
150
+ intermediate_size (`int`, *optional*, defaults to 3072):
151
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
152
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
153
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
154
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
155
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
156
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
157
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
158
+ The dropout ratio for the attention probabilities.
159
+ max_position_embeddings (`int`, *optional*, defaults to 512):
160
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
161
+ just in case (e.g., 512 or 1024 or 2048).
162
+ initializer_range (`float`, *optional*, defaults to 0.02):
163
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
164
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
165
+ The epsilon used by the layer normalization layers.
166
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
167
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
168
+ positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
169
+ [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
170
+ For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
171
+ with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
172
+ classifier_dropout (`float`, *optional*):
173
+ The dropout ratio for the classification head.
174
+ cross_attention_frequency (`int`, *optional*, defaults to 2):
175
+ The frequency of adding cross-attention to the Transformer layers.
176
+ encoder_hidden_size (`int`, *optional*, defaults to 1408):
177
+ The hidden size of the hidden states for cross-attention.
178
+ """
179
+ model_type = "husky_qformer"
180
+
181
+ def __init__(
182
+ self,
183
+ vocab_size=30522,
184
+ hidden_size=768,
185
+ num_hidden_layers=12,
186
+ num_attention_heads=12,
187
+ intermediate_size=3072,
188
+ hidden_act="gelu",
189
+ hidden_dropout_prob=0.1,
190
+ attention_probs_dropout_prob=0.1,
191
+ max_position_embeddings=512,
192
+ initializer_range=0.02,
193
+ layer_norm_eps=1e-12,
194
+ pad_token_id=0,
195
+ position_embedding_type="absolute",
196
+ classifier_dropout=None,
197
+ cross_attention_frequency=2,
198
+ encoder_hidden_size=1408,
199
+ _flash_attn_2_enabled=True,
200
+ **kwargs,
201
+ ):
202
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
203
+
204
+ self.vocab_size = vocab_size
205
+ self.hidden_size = hidden_size
206
+ self.num_hidden_layers = num_hidden_layers
207
+ self.num_attention_heads = num_attention_heads
208
+ self.hidden_act = hidden_act
209
+ self.intermediate_size = intermediate_size
210
+ self.hidden_dropout_prob = hidden_dropout_prob
211
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
212
+ self.max_position_embeddings = max_position_embeddings
213
+ self.initializer_range = initializer_range
214
+ self.layer_norm_eps = layer_norm_eps
215
+ self.position_embedding_type = position_embedding_type
216
+ self.classifier_dropout = classifier_dropout
217
+ self.cross_attention_frequency = cross_attention_frequency
218
+ self.encoder_hidden_size = encoder_hidden_size
219
+ self._flash_attn_2_enabled = _flash_attn_2_enabled
220
+
221
+ @classmethod
222
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
223
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
224
+ # get the qformer config dict if we are loading from HuskyConfig
225
+ if config_dict.get("model_type") == "husky":
226
+ config_dict = config_dict["qformer_config"]
227
+
228
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
229
+ logger.warning(
230
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
231
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
232
+ )
233
+
234
+ return cls.from_dict(config_dict, **kwargs)
235
+
236
+ class HuskyConfig(PretrainedConfig):
237
+ r"""
238
+ [`HuskyConfig`] is the configuration class to store the configuration of a
239
+ [`HuskyForConditionalGeneration`]. It is used to instantiate a Husky model according to the specified
240
+ arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
241
+ the defaults will yield a similar configuration to that of the Husky
242
+ [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
243
+
244
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
245
+ documentation from [`PretrainedConfig`] for more information.
246
+
247
+ Args:
248
+ vision_config (`dict`, *optional*):
249
+ Dictionary of configuration options used to initialize [`HuskyVisionConfig`].
250
+ qformer_config (`dict`, *optional*):
251
+ Dictionary of configuration options used to initialize [`HuskyQFormerConfig`].
252
+ text_config (`dict`, *optional*):
253
+ Dictionary of configuration options used to initialize any [`PretrainedConfig`].
254
+ num_query_tokens (`int`, *optional*, defaults to 32):
255
+ The number of query tokens passed through the Transformer.
256
+
257
+ kwargs (*optional*):
258
+ Dictionary of keyword arguments.
259
+ """
260
+
261
+ model_type = "husky"
262
+ is_composition = True
263
+
264
+ def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
265
+ super().__init__(**kwargs)
266
+
267
+ if vision_config is None:
268
+ vision_config = {}
269
+ logger.info("vision_config is None. initializing the HuskyVisionConfig with default values.")
270
+
271
+ if qformer_config is None:
272
+ qformer_config = {}
273
+ logger.info("qformer_config is None. Initializing the HuskyQFormerConfig with default values.")
274
+
275
+ if text_config is None:
276
+ text_config = {}
277
+ logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
278
+
279
+ self.vision_config = HuskyVisionConfig(**vision_config)
280
+ self.qformer_config = HuskyQFormerConfig(**qformer_config)
281
+ text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
282
+ self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
283
+
284
+ self.tie_word_embeddings = self.text_config.tie_word_embeddings
285
+ self.is_encoder_decoder = self.text_config.is_encoder_decoder
286
+
287
+ self.num_query_tokens = num_query_tokens
288
+ self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
289
+ self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
290
+ self.initializer_factor = 1.0
291
+ self.initializer_range = 0.02
292
+
293
+ @classmethod
294
+ def from_vision_qformer_text_configs(
295
+ cls,
296
+ vision_config: HuskyVisionConfig,
297
+ qformer_config: HuskyQFormerConfig,
298
+ text_config: PretrainedConfig,
299
+ **kwargs,
300
+ ):
301
+ r"""
302
+ Instantiate a [`HuskyConfig`] (or a derived class) from a Husky vision model, Q-Former and
303
+ language model configurations.
304
+
305
+ Returns:
306
+ [`HuskyConfig`]: An instance of a configuration object
307
+ """
308
+
309
+ return cls(
310
+ vision_config=vision_config.to_dict(),
311
+ qformer_config=qformer_config.to_dict(),
312
+ text_config=text_config.to_dict(),
313
+ **kwargs,
314
+ )
315
+
316
+ def to_dict(self):
317
+ """
318
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
319
+
320
+ Returns:
321
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
322
+ """
323
+ output = copy.deepcopy(self.__dict__)
324
+ output["vision_config"] = self.vision_config.to_dict()
325
+ output["qformer_config"] = self.qformer_config.to_dict()
326
+ output["text_config"] = self.text_config.to_dict()
327
+ output["model_type"] = self.__class__.model_type
328
+ return output
329
+
330
+ if __name__ == '__main__':
331
+ config = HuskyConfig.from_pretrain
robohusky/model/configuration_husky_ori.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Husky model configuration"""
16
+
17
+ import copy
18
+ import os
19
+ from typing import Union
20
+
21
+ from transformers.configuration_utils import PretrainedConfig
22
+ from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
23
+ from transformers.utils import logging
24
+
25
+ from transformers.models.auto import CONFIG_MAPPING
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ HUSKY_PRETRAINED_CONFIG_ARCHIVE_MAP = {
30
+ "wofmanaf/husky-7b": "https://huggingface.co/wofmanaf/husky-7b/resolve/main/config.json",
31
+ }
32
+
33
+ class HuskyVisionConfig(PretrainedConfig):
34
+ r"""
35
+ This is the configuration class to store the configuration of a [`HuskyVisionModel`]. It is used to
36
+ instantiate a Husky vision encoder according to the specified arguments, defining the model architecture.
37
+ Instantiating a configuration defaults will yield a similar configuration to that of the Husky architecture.
38
+
39
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
40
+ documentation from [`PretrainedConfig`] for more information.
41
+
42
+ Args:
43
+ hidden_size (`int`, *optional*, defaults to 1408):
44
+ Dimensionality of the encoder layers and the pooler layer.
45
+ intermediate_size (`int`, *optional*, defaults to 6144):
46
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
47
+ num_hidden_layers (`int`, *optional*, defaults to 39):
48
+ Number of hidden layers in the Transformer encoder.
49
+ num_attention_heads (`int`, *optional*, defaults to 16):
50
+ Number of attention heads for each attention layer in the Transformer encoder.
51
+ image_size (`int`, *optional*, defaults to 224):
52
+ The size (resolution) of each image.
53
+ patch_size (`int`, *optional*, defaults to 14):
54
+ The size (resolution) of each patch.
55
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
56
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
57
+ `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
58
+ to 1e-5): The epsilon used by the layer normalization layers.
59
+ dropout (`float`, *optional*, defaults to 0.0):
60
+ The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
61
+ attention_dropout (`float`, *optional*, defaults to 0.0):
62
+ The dropout ratio for the attention probabilities.
63
+ initializer_range (`float`, *optional*, defaults to 0.02):
64
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
65
+ initializer_factor (`float``, *optional*, defaults to 1):
66
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
67
+ testing).
68
+ qkv_bias (`bool`, *optional*, defaults to `True`):
69
+ Whether to add a bias to the queries and values in the self-attention layers.
70
+ """
71
+
72
+ model_type = "husky_vision_model"
73
+
74
+ def __init__(
75
+ self,
76
+ hidden_size=1408,
77
+ intermediate_size=6144,
78
+ projection_dim=512,
79
+ num_hidden_layers=39,
80
+ num_attention_heads=16,
81
+ num_channels=3,
82
+ image_size=224,
83
+ patch_size=14,
84
+ hidden_act="gelu",
85
+ layer_norm_eps=0.00001,
86
+ dropout=0.0,
87
+ attention_dropout=0.0,
88
+ initializer_range=1e-10,
89
+ initializer_factor=1.0,
90
+ qkv_bias=True,
91
+ **kwargs,
92
+ ):
93
+ super().__init__(**kwargs)
94
+
95
+ self.hidden_size = hidden_size
96
+ self.intermediate_size = intermediate_size
97
+ self.projection_dim = projection_dim
98
+ self.dropout = dropout
99
+ self.num_hidden_layers = num_hidden_layers
100
+ self.num_attention_heads = num_attention_heads
101
+ self.num_channels = num_channels
102
+ self.patch_size = patch_size
103
+ self.image_size = image_size
104
+ self.initializer_range = initializer_range
105
+ self.initializer_factor = initializer_factor
106
+ self.attention_dropout = attention_dropout
107
+ self.layer_norm_eps = layer_norm_eps
108
+ self.hidden_act = hidden_act
109
+ self.qkv_bias = qkv_bias
110
+
111
+ @classmethod
112
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
113
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
114
+
115
+ # get the vision config dict if we are loading from HuskyConfig
116
+ if config_dict.get("model_type") == "husky":
117
+ config_dict = config_dict["vision_config"]
118
+
119
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
120
+ logger.warning(
121
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
122
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
123
+ )
124
+
125
+ return cls.from_dict(config_dict, **kwargs)
126
+
127
class HuskyQFormerConfig(PretrainedConfig):
    r"""
    Configuration class for a [`HuskyQFormerModel`] (the Querying Transformer, or Q-Former,
    used by Husky). Instantiating it with no arguments yields a configuration similar to the
    Husky [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5)
    architecture. Note that a [`HuskyQFormerModel`] is very similar to a [`BertLMHeadModel`]
    with interleaved cross-attention.

    Configuration objects inherit from [`PretrainedConfig`]; read its documentation for the
    inherited behaviour.

    Args:
        vocab_size (`int`, *optional*, defaults to 30522):
            Vocabulary size of the Q-Former model.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the feed-forward layer in the Transformer encoder.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            Non-linear activation function; `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"`
            are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            Dropout probability for fully connected layers in embeddings, encoder and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            Dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            Maximum sequence length the model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated-normal initializer for weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            Epsilon used by the layer-normalization layers.
        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
            One of `"absolute"`, `"relative_key"`, `"relative_key_query"`.
        classifier_dropout (`float`, *optional*):
            Dropout ratio for the classification head.
        cross_attention_frequency (`int`, *optional*, defaults to 2):
            How often a cross-attention layer is inserted between Transformer layers.
        encoder_hidden_size (`int`, *optional*, defaults to 1408):
            Hidden size of the encoder hidden states consumed via cross-attention.
    """

    model_type = "husky_qformer"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        classifier_dropout=None,
        cross_attention_frequency=2,
        encoder_hidden_size=1408,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.classifier_dropout = classifier_dropout
        self.cross_attention_frequency = cross_attention_frequency
        self.encoder_hidden_size = encoder_hidden_size

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Load a Q-Former config, unwrapping it from a composite Husky config if needed."""
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # When loading from a full HuskyConfig, pick out the nested Q-Former section.
        if config_dict.get("model_type") == "husky":
            config_dict = config_dict["qformer_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
231
+
232
class HuskyConfig(PretrainedConfig):
    r"""
    [`HuskyConfig`] stores the configuration of a [`HuskyForConditionalGeneration`] model.
    It bundles three sub-configurations - vision encoder, Q-Former and language model - and
    is used to instantiate a Husky model accordingly. Instantiating a configuration with the
    defaults yields a configuration similar to the Husky
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`]; read its documentation for the
    inherited behaviour.

    Args:
        vision_config (`dict`, *optional*):
            Options used to initialize the [`HuskyVisionConfig`].
        qformer_config (`dict`, *optional*):
            Options used to initialize the [`HuskyQFormerConfig`].
        text_config (`dict`, *optional*):
            Options used to initialize any text [`PretrainedConfig`]; defaults to an OPT
            config when no `model_type` key is given.
        num_query_tokens (`int`, *optional*, defaults to 32):
            Number of query tokens passed through the Transformer.
        kwargs (*optional*):
            Dictionary of keyword arguments forwarded to [`PretrainedConfig`].
    """

    model_type = "husky"
    is_composition = True

    def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. initializing the HuskyVisionConfig with default values.")

        if qformer_config is None:
            qformer_config = {}
            logger.info("qformer_config is None. Initializing the HuskyQFormerConfig with default values.")

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")

        self.vision_config = HuskyVisionConfig(**vision_config)
        self.qformer_config = HuskyQFormerConfig(**qformer_config)
        text_model_type = text_config.get("model_type", "opt")
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        # Mirror frequently-consulted flags from the language-model config.
        self.tie_word_embeddings = self.text_config.tie_word_embeddings
        self.is_encoder_decoder = self.text_config.is_encoder_decoder

        self.num_query_tokens = num_query_tokens
        # The Q-Former cross-attends over vision features, so its encoder width must
        # follow the vision tower's hidden size.
        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    @classmethod
    def from_vision_qformer_text_configs(
        cls,
        vision_config: HuskyVisionConfig,
        qformer_config: HuskyQFormerConfig,
        text_config: PretrainedConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`HuskyConfig`] (or a derived class) from a Husky vision model,
        Q-Former and language model configurations.

        Returns:
            [`HuskyConfig`]: An instance of a configuration object
        """
        return cls(
            vision_config=vision_config.to_dict(),
            qformer_config=qformer_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary, expanding the nested
        sub-configurations. Overrides the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this
            configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        for name in ("vision_config", "qformer_config", "text_config"):
            output[name] = getattr(self, name).to_dict()
        output["model_type"] = self.__class__.model_type
        return output
325
+
326
+ if __name__ == '__main__':
327
+ config = HuskyConfig.from_pretrain
robohusky/model/modeling_husky.py ADDED
@@ -0,0 +1,1820 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The Salesforce Authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch Husky model."""
16
+
17
+ import contextlib
18
+ import math
19
+ from dataclasses import dataclass
20
+ from typing import Any, Optional, Tuple, Union
21
+
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import CrossEntropyLoss
26
+
27
+ from transformers.activations import ACT2FN
28
+ from transformers.modeling_outputs import (
29
+ BaseModelOutput,
30
+ BaseModelOutputWithPastAndCrossAttentions,
31
+ BaseModelOutputWithPooling,
32
+ BaseModelOutputWithPoolingAndCrossAttentions,
33
+ )
34
+ from transformers.modeling_utils import PreTrainedModel
35
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
36
+ from transformers.utils import (
37
+ ModelOutput,
38
+ add_start_docstrings,
39
+ add_start_docstrings_to_model_forward,
40
+ logging,
41
+ replace_return_docstrings,
42
+ )
43
+ from transformers import AutoModelForCausalLM, GenerationConfig
44
+
45
+ from .configuration_husky import HuskyConfig, HuskyQFormerConfig, HuskyVisionConfig
46
+
47
+ logger = logging.get_logger(__name__)
48
+
49
+ _CHECKPOINT_FOR_DOC = "wofmanaf/husky-7b"
50
+
51
+ HUSKY_PRETRAINED_MODEL_ARCHIVE_LIST = [
52
+ "wofmanaf/husky-7b",
53
+ ]
54
+
55
@dataclass
class HuskyForConditionalGenerationModelOutput(ModelOutput):
    """
    Output type of [`HuskyForConditionalGeneration`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss from the language model.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head of the language model.
        vision_outputs (`BaseModelOutputWithPooling`):
            Outputs of the vision encoder.
        qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
            Outputs of the Q-Former (Querying Transformer).
        language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
            Outputs of the language model.
    """

    loss: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    vision_outputs: Optional[torch.FloatTensor] = None
    qformer_outputs: Optional[Tuple[torch.FloatTensor]] = None
    language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None

    def to_tuple(self) -> Tuple[Any]:
        # Nested sub-model outputs are flattened via their own `to_tuple`;
        # plain fields are returned as stored.
        nested = ("vision_outputs", "qformer_outputs", "language_model_outputs")
        return tuple(
            getattr(self, key).to_tuple() if key in nested else self[key]
            for key in self.keys()
        )
86
+
87
# Copied from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->Husky
class HuskyVisionEmbeddings(nn.Module):
    """Patchify an image and prepend a [CLS] token, adding learned position embeddings."""

    def __init__(self, config: HuskyVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Learnable [CLS] token prepended to the patch sequence.
        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        # Non-overlapping patchification via a strided convolution.
        self.patch_embedding = nn.Conv2d(
            in_channels=3,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1  # +1 for the [CLS] slot
        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch = pixel_values.shape[0]
        weight_dtype = self.patch_embedding.weight.dtype

        patches = self.patch_embedding(pixel_values)  # (B, C, H/ps, W/ps)
        patches = patches.flatten(2).transpose(1, 2)  # (B, num_patches, C)

        cls_token = self.class_embedding.expand(batch, 1, -1).to(weight_dtype)
        tokens = torch.cat([cls_token, patches], dim=1)
        # Position table is sliced so shorter sequences (fewer patches) still work.
        positions = self.position_embedding[:, : tokens.size(1), :].to(weight_dtype)
        return tokens + positions
119
+
120
class HuskyVideoEmbeddings(nn.Module):
    """Tubelet-patchify a video clip and prepend a [CLS] token, adding position embeddings."""

    def __init__(self, config: HuskyVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        # Temporal settings fall back to 8 frames / stride 2 when absent from the config.
        self.num_frames = getattr(self.config, "num_frames", 8)
        self.frame_stride = getattr(self.config, "frame_stride", 2)

        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        # 3D patchification: every `frame_stride` frames form one tubelet.
        self.patch_embedding = nn.Conv3d(
            in_channels=3,
            out_channels=self.embed_dim,
            kernel_size=(self.frame_stride, self.patch_size, self.patch_size),
            stride=(self.frame_stride, self.patch_size, self.patch_size),
        )

        self.num_patches = int(self.num_frames // self.frame_stride) * (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1  # +1 for the [CLS] slot
        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch = pixel_values.shape[0]
        weight_dtype = self.patch_embedding.weight.dtype

        patches = self.patch_embedding(pixel_values)  # (B, C, T', H', W')
        patches = patches.flatten(2).transpose(1, 2)  # (B, num_patches, C)

        cls_token = self.class_embedding.expand(batch, 1, -1).to(weight_dtype)
        tokens = torch.cat([cls_token, patches], dim=1)
        positions = self.position_embedding[:, : tokens.size(1), :].to(weight_dtype)
        return tokens + positions
155
+
156
class HuskyAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim ** -0.5
        self.dropout = nn.Dropout(config.attention_dropout)

        # Single fused q/k/v projection; unlike CLIP there is no bias by default.
        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)

        if config.qkv_bias:
            # BLIP-style bias: learnable slots for q and v, fixed zeros for k.
            q_bias = torch.zeros(self.embed_dim)
            v_bias = torch.zeros(self.embed_dim)
            self.qkv.bias = nn.Parameter(torch.cat((q_bias, torch.zeros_like(v_bias), v_bias)))

        self.projection = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # Reshape (B, T, C) -> (B, heads, T, head_dim).
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        bsz, seq_len, embed_dim = hidden_states.size()

        # Fused projection, then split into per-head q/k/v:
        # (B, T, 3C) -> (3, B, heads, T, head_dim)
        fused = self.qkv(hidden_states)
        fused = fused.reshape(bsz, seq_len, 3, self.num_heads, embed_dim // self.num_heads).permute(2, 0, 3, 1, 4)
        query_states, key_states, value_states = fused.unbind(0)

        # Scaled dot-product scores, normalized to probabilities.
        scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.scale
        attention_probs = nn.functional.softmax(scores, dim=-1)

        # Dropping whole tokens to attend to - unusual, but from the original
        # Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # `head_mask` multiplies the probabilities per head when provided.
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)
        context = context.reshape(context.size()[:-2] + (self.embed_dim,))

        output = self.projection(context)
        return (output, attention_probs) if output_attentions else (output, None)
239
+
240
# Copied from transformers.models.blip.modeling_blip.BlipMLP
class HuskyMLP(nn.Module):
    """Two-layer feed-forward block: expand, apply the configured activation, project back."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.fc2(self.activation_fn(self.fc1(hidden_states)))
254
+
255
# Copied from transformers.models.blip.modeling_blip.BlipEncoderLayer with Blip->Husky
class HuskyEncoderLayer(nn.Module):
    """Pre-norm Transformer encoder layer: self-attention then MLP, each with a residual."""

    def __init__(self, config: HuskyConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = HuskyAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = HuskyMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
                large negative values. NOTE(review): this is passed to the attention module
                as `head_mask`, which *multiplies* the probabilities - confirm callers
                supply a head mask rather than an additive padding mask.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
        """
        # Self-attention sub-block (pre-norm + residual).
        residual = hidden_states
        normed = self.layer_norm1(hidden_states)
        attn_out, attn_weights = self.self_attn(
            hidden_states=normed,
            head_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + attn_out

        # Feed-forward sub-block (pre-norm + residual).
        residual = hidden_states
        hidden_states = residual + self.mlp(self.layer_norm2(hidden_states))

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
302
+
303
class HuskyPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for
    downloading and loading pretrained models.
    """

    config_class = HuskyConfig
    base_model_prefix = "husky"
    supports_gradient_checkpointing = True
    # Language-model embedding/head weights may be tied or loaded separately.
    _keys_to_ignore_on_load_missing = [
        r"position_ids",
        r"language_model.encoder.embed_tokens.weight",
        r"language_model.decoder.embed_tokens.weight",
        r"language_model.lm_head.weight",
    ]
    _no_split_modules = ["HuskyAttention", "LlamaDecoderLayer", "LlamaForCausalLM"]
    _skip_keys_device_placement = "past_key_values"
    _keep_in_fp32_modules = ["wo"]

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_range
        # Dense-ish modules: normal init for weights, zeros for biases.
        if isinstance(module, (nn.Conv2d, nn.Embedding, nn.Linear)):
            module.weight.data.normal_(mean=0.0, std=factor)
            if hasattr(module, "bias") and module.bias is not None:
                module.bias.data.zero_()

        if isinstance(module, HuskyVisionEmbeddings):
            # Vision embeddings use the vision tower's own initializer range when present.
            if hasattr(self.config, "vision_config"):
                factor = self.config.vision_config.initializer_range
            nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
            nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            # NOTE(review): redundant with the Linear branch above; kept for parity.
            module.bias.data.zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        # Only the vision encoder supports toggling gradient checkpointing here.
        if isinstance(module, HuskyEncoder):
            module.gradient_checkpointing = value
345
+
346
# Docstring templates consumed by `add_start_docstrings` /
# `add_start_docstrings_to_model_forward` on the Husky model classes below.
Husky_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`HuskyConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

Husky_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`HuskyProcessor`]. See [`HuskyProcessor.__call__`] for
            details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

Husky_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

Husky_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`HuskyProcessor`]. See [`HuskyProcessor.__call__`] for
            details.

        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
            provided to serve as text prompt, which the language model can continue.

            Indices can be obtained using [`HuskyProcessor`]. See [`HuskyProcessor.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an
            encoder-decoder language model (like T5) is used.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids)

        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
455
+ """
456
+
457
# Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->Husky
class HuskyEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`HuskyEncoderLayer`].

    Args:
        config (`HuskyConfig`):
            The corresponding vision configuration for the `HuskyEncoder`.
    """

    def __init__(self, config: HuskyConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList(HuskyEncoderLayer(config) for _ in range(config.num_hidden_layers))
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        cfg = self.config
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if return_dict is None:
            return_dict = cfg.use_return_dict

        collected_states = () if output_hidden_states else None
        collected_attentions = () if output_attentions else None

        current = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                # Record the input to each layer (and the final output after the loop).
                collected_states = collected_states + (current,)

            if self.gradient_checkpointing and self.training:
                # Recompute activations in the backward pass to save memory.
                def run_layer(*layer_inputs, _layer=encoder_layer):
                    return _layer(*layer_inputs, output_attentions)

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    run_layer,
                    current,
                    attention_mask,
                )
            else:
                layer_outputs = encoder_layer(
                    current,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            current = layer_outputs[0]
            if output_attentions:
                collected_attentions = collected_attentions + (layer_outputs[1],)

        if output_hidden_states:
            collected_states = collected_states + (current,)

        if not return_dict:
            return tuple(v for v in (current, collected_states, collected_attentions) if v is not None)
        return BaseModelOutput(
            last_hidden_state=current, hidden_states=collected_states, attentions=collected_attentions
        )
549
# Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->Husky, BLIP->Husky
class HuskyVisionModel(HuskyPreTrainedModel):
    """Vision backbone: image/video embeddings, a transformer encoder, and a final LayerNorm."""

    main_input_name = "pixel_values"
    config_class = HuskyVisionConfig

    def __init__(self, config: HuskyVisionConfig):
        super().__init__(config)
        self.config = config

        # Separate embedding modules for still images (4-D input) and video clips (5-D input).
        self.embeddings = HuskyVisionEmbeddings(config)
        self.video_embeddings = HuskyVideoEmbeddings(config)

        self.encoder = HuskyEncoder(config)
        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.post_init()

    @add_start_docstrings_to_model_forward(Husky_VISION_INPUTS_DOCSTRING)
    # @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=HuskyVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        cfg = self.config
        output_attentions = cfg.output_attentions if output_attentions is None else output_attentions
        output_hidden_states = cfg.output_hidden_states if output_hidden_states is None else output_hidden_states
        return_dict = cfg.use_return_dict if return_dict is None else return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Rank decides the input modality: 4-D tensors are images, 5-D tensors are videos.
        rank = len(pixel_values.shape)
        if rank == 4:
            hidden_states = self.embeddings(pixel_values)
        elif rank == 5:
            hidden_states = self.video_embeddings(pixel_values)
        else:
            raise ValueError(f"wrong pixel_values size: {pixel_values.shape}")

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = self.post_layernorm(encoder_outputs[0])

        # Pool the first (CLS-like) token, then normalize it again, mirroring BLIP.
        pooled = self.post_layernorm(sequence_output[:, 0, :])

        if not return_dict:
            return (sequence_output, pooled) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.embeddings

    def get_video_embeddings(self):
        return self.video_embeddings
621
class HuskyQFormerMultiHeadAttention(nn.Module):
    """Multi-head attention for the Q-Former.

    With ``is_cross_attention=True`` the key/value projections read from encoder states
    (``config.encoder_hidden_size``); otherwise they read from ``hidden_states`` itself.
    Supports cached key/value states (``past_key_value``) and optional relative position
    embeddings (``relative_key`` / ``relative_key_query``).
    """

    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.config = config
        # `embedding_size` opt-out mirrors BERT: a model may decouple embedding and hidden sizes.
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            # Cross-attention: keys/values are projected from the (vision) encoder's hidden size.
            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
        # When True, cross-attention maps and their gradients are stashed via the hooks below.
        self.save_attention = False

    def save_attn_gradients(self, attn_gradients):
        # Backward hook target: keeps the gradient of the attention probabilities.
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def transpose_for_scores(self, x):
        # (batch, seq, all_head) -> (batch, heads, seq, head_size)
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            # Self-attention with cache: append new keys/values after the cached ones.
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        mixed_query_layer = self.query(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Always returned so callers can reuse keys/values on the next step.
        past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            # Shift by (max_position_embeddings - 1) so all distances index non-negative rows.
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        if is_cross_attention and self.save_attention:
            # Keep the map and register a hook so gradients can be inspected (e.g. for attribution).
            self.save_attention_map(attention_probs)
            attention_probs.register_hook(self.save_attn_gradients)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs_dropped = attention_probs_dropped * head_mask

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        # (batch, heads, seq, head_size) -> (batch, seq, all_head)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        # The cache tuple is always the last element of the output.
        outputs = outputs + (past_key_value,)
        return outputs
752
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->HuskyQFormer
class HuskyQFormerSelfOutput(nn.Module):
    """Post-attention projection: dense -> dropout -> residual add -> LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        projected = self.dropout(self.dense(hidden_states))
        # Residual connection with the block input, then normalization.
        return self.LayerNorm(projected + input_tensor)
766
class HuskyQFormerAttention(nn.Module):
    """Attention block: multi-head (self- or cross-) attention followed by the residual output layer."""

    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.attention = HuskyQFormerMultiHeadAttention(config, is_cross_attention)
        self.output = HuskyQFormerSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads and shrink the projection layers accordingly."""
        if not heads:
            return
        attn = self.attention
        heads, index = find_pruneable_heads_and_indices(
            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        attn.query = prune_linear_layer(attn.query, index)
        attn.key = prune_linear_layer(attn.key, index)
        attn.value = prune_linear_layer(attn.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        attn.num_attention_heads = attn.num_attention_heads - len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        attn_results = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # Residual output projection, then pass through any extra outputs (attentions, cache).
        attention_output = self.output(attn_results[0], hidden_states)
        return (attention_output,) + attn_results[1:]
814
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->HuskyQFormer
class HuskyQFormerIntermediate(nn.Module):
    """Feed-forward up-projection followed by the configured activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # `hidden_act` is either a string key into ACT2FN or already a callable.
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.intermediate_act_fn(self.dense(hidden_states))
829
# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->HuskyQFormer
class HuskyQFormerOutput(nn.Module):
    """Feed-forward down-projection: dense -> dropout -> residual add -> LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        projected = self.dropout(self.dense(hidden_states))
        # Residual connection with the feed-forward input, then normalization.
        return self.LayerNorm(projected + input_tensor)
843
class HuskyQFormerLayer(nn.Module):
    """One Q-Former block: self-attention, optional cross-attention (only on every
    ``config.cross_attention_frequency``-th layer), and a query feed-forward network.

    The first ``query_length`` positions of the sequence are learned query tokens; any
    remaining positions are treated as text tokens and bypass cross-attention.
    """

    def __init__(self, config, layer_idx):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = HuskyQFormerAttention(config)

        self.layer_idx = layer_idx

        # Cross-attention to the vision encoder is only inserted periodically.
        if layer_idx % config.cross_attention_frequency == 0:
            self.crossattention = HuskyQFormerAttention(config, is_cross_attention=True)
            self.has_cross_attention = True
        else:
            self.has_cross_attention = False

        # NOTE(review): only the *_query feed-forward modules are created here, yet
        # `feed_forward_chunk` below references `self.intermediate` / `self.output`,
        # which are never defined. That branch runs when `query_length == 0` or when
        # the sequence is longer than `query_length`, and would raise AttributeError
        # — confirm whether those paths are ever reached in this project.
        self.intermediate_query = HuskyQFormerIntermediate(config)
        self.output_query = HuskyQFormerOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        query_length=0,
    ):
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]
        # Everything between the context output and the trailing cache tuple (i.e. attention maps).
        outputs = self_attention_outputs[1:-1]

        present_key_value = self_attention_outputs[-1]

        if query_length > 0:
            # Split off the query tokens: only they attend to the encoder states.
            query_attention_output = attention_output[:, :query_length, :]

            if self.has_cross_attention:
                if encoder_hidden_states is None:
                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
                cross_attention_outputs = self.crossattention(
                    query_attention_output,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions=output_attentions,
                )
                query_attention_output = cross_attention_outputs[0]
                # add cross attentions if we output attention weights
                outputs = outputs + cross_attention_outputs[1:-1]

            # Query tokens go through the query-specific feed-forward network.
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk_query,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                query_attention_output,
            )

            if attention_output.shape[1] > query_length:
                # Text tokens (past the queries) use the plain feed-forward path; see NOTE above.
                layer_output_text = apply_chunking_to_forward(
                    self.feed_forward_chunk,
                    self.chunk_size_feed_forward,
                    self.seq_len_dim,
                    attention_output[:, query_length:, :],
                )
                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
        else:
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )
        outputs = (layer_output,) + outputs

        outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        # NOTE(review): `self.intermediate` / `self.output` are not created in __init__.
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

    def feed_forward_chunk_query(self, attention_output):
        intermediate_output = self.intermediate_query(attention_output)
        layer_output = self.output_query(intermediate_output, attention_output)
        return layer_output
942
class HuskyQFormerEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` [`HuskyQFormerLayer`] blocks.

    Supports head masking, per-layer key/value caching, optional gradient
    checkpointing, and cross-attention to encoder hidden states.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [HuskyQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        # NOTE(review): forward() checks `config.gradient_checkpointing`, not this
        # attribute — confirm which flag the surrounding framework actually toggles.
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        query_length=0,
    ):
        """Run all layers; returns either a tuple or a
        [`BaseModelOutputWithPastAndCrossAttentions`] depending on `return_dict`.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions else None

        next_decoder_cache = () if use_cache else None

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if getattr(self.config, "gradient_checkpointing", False) and self.training:
                if use_cache:
                    # Checkpointing recomputes activations, so cached key/values cannot be kept.
                    # (logger.warn is deprecated; use logger.warning.)
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    # Bind the non-tensor arguments; checkpoint() only forwards tensors.
                    def custom_forward(*inputs):
                        return module(*inputs, past_key_value, output_attentions, query_length)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    query_length,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                # Each layer returns its (key, value) cache as the last output element.
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if layer_module.has_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
1043
class HuskyQFormerModel(HuskyPreTrainedModel):
    """
    Querying Transformer (Q-Former), used in Husky.

    Takes learned query embeddings (``query_embeds``) — there is no token-embedding
    layer here — normalizes them, and runs them through [`HuskyQFormerEncoder`],
    cross-attending to ``encoder_hidden_states`` (vision features).
    """

    def __init__(self, config: HuskyQFormerConfig):
        super().__init__(config)
        self.config = config

        # Input queries are layer-normalized and dropped out before the encoder.
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.encoder = HuskyQFormerEncoder(config)

        self.post_init()

    def get_input_embeddings(self):
        # NOTE(review): `self.embeddings` is never created in __init__ — calling this
        # would raise AttributeError. Presumably dead code inherited from a BERT-style
        # template; confirm nothing calls it (e.g. weight tying / resize utilities).
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        # NOTE(review): same missing-`self.embeddings` caveat as get_input_embeddings.
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: Tuple[int],
        device: torch.device,
        has_query: bool = False,  # NOTE(review): accepted but unused in this override
    ) -> torch.Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.
            device (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        """
        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                    input_shape, attention_mask.shape
                )
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    def forward(
        self,
        query_embeds,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of:
            shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
            value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
            used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
            value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape
            `(batch_size, sequence_length)`.
        use_cache (`bool`, `optional`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # past_key_values_length
        # Cached length excludes the query tokens, which are re-fed every step.
        past_key_values_length = (
            past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0
        )

        query_length = query_embeds.shape[1] if query_embeds is not None else 0

        embedding_output = self.layernorm(query_embeds)
        embedding_output = self.dropout(embedding_output)

        input_shape = embedding_output.size()[:-1]
        batch_size, seq_length = input_shape
        device = embedding_output.device

        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            # A list of encoder states means one tensor (and mask) per modality/source.
            if type(encoder_hidden_states) == list:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                (
                    encoder_batch_size,
                    encoder_sequence_length,
                    _,
                ) = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if type(encoder_attention_mask) == list:
                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            query_length=query_length,
        )
        sequence_output = encoder_outputs[0]
        # Pool the first query token's final hidden state.
        pooled_output = sequence_output[:, 0, :]

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
1235
class AdapterMLP(nn.Module):
    """Two-layer bottleneck MLP mapping vision features into the Q-Former width.

    Projects from the vision encoder hidden size down to a quarter of it,
    applies SiLU, then projects out to the Q-Former hidden size.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN["silu"]

        in_features = config.vision_config.hidden_size
        bottleneck = in_features // 4
        out_features = config.qformer_config.hidden_size

        self.fc1 = nn.Linear(in_features, bottleneck)
        self.fc2 = nn.Linear(bottleneck, out_features)

        # Custom init kept disabled, as in the original implementation:
        # nn.init.trunc_normal_(self.fc1.weight, std=0.02)
        # nn.init.trunc_normal_(self.fc2.weight, std=0.02)
        # nn.init.constant_(self.fc1.bias, 0)
        # nn.init.constant_(self.fc2.bias, 0)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply fc1 -> SiLU -> fc2 to the last dimension of ``hidden_states``."""
        return self.fc2(self.activation_fn(self.fc1(hidden_states)))
1257
+
1258
@add_start_docstrings(
    """
    Husky Model for generating text and image features. The model consists of a vision encoder, Querying Transformer
    (Q-Former) and a language model.
    """,
    Husky_START_DOCSTRING,
)
class HuskyModel(HuskyPreTrainedModel):
    # BLIP-2-style pipeline: vision encoder -> learnable queries through the
    # Q-Former -> linear projection -> spliced into the causal LM's embeddings.
    config_class = HuskyConfig
    main_input_name = "pixel_values"

    def __init__(self, config: HuskyConfig):
        super().__init__(config)

        self.vision_model = HuskyVisionModel(config.vision_config)

        # Learnable query embeddings fed to the Q-Former,
        # shape (1, num_query_tokens, qformer_hidden_size); expanded per batch in forward.
        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = HuskyQFormerModel(config.qformer_config)

        # Maps Q-Former outputs into the language model's embedding space.
        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)

        self.config.hidden_size = config.text_config.hidden_size
        self.num_queries = config.num_query_tokens
        # Index in `input_ids` where the image placeholder tokens begin; forward
        # overwrites positions [offset, offset + num_queries) with visual embeddings.
        # Assumes the prompt template reserves exactly num_queries tokens there —
        # TODO confirm against the tokenizer/conversation template.
        self.offset = 5

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # Delegate to the wrapped language model.
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self):
        return self.language_model.get_encoder()

    def get_decoder(self):
        return self.language_model.get_decoder()

    def _tie_weights(self):
        # Only encoder-decoder LMs (e.g. T5-style) share a single embedding matrix;
        # decoder-only models handle tying themselves.
        if not self.config.use_decoder_only_language_model:
            self.language_model.encoder.embed_tokens = self.language_model.shared
            self.language_model.decoder.embed_tokens = self.language_model.shared

    @add_start_docstrings_to_model_forward(Husky_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        Run only the language model (no vision path).

        Returns:
            text_outputs (`CausalLMOutputWithPast`, or `tuple(torch.FloatTensor)` if `return_dict=False`):
                The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that
                contains the language model logits, the past key values and the hidden states if
                `output_hidden_states=True`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.language_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return text_outputs

    @add_start_docstrings_to_model_forward(Husky_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Run only the vision encoder and return its raw outputs."""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return vision_outputs

    @add_start_docstrings_to_model_forward(Husky_INPUTS_DOCSTRING)
    def get_qformer_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        Encode the image and cross-attend the learnable queries to it.

        Returns:
            vision_outputs (`BaseModelOutputWithPooling` or tuple of `torch.FloatTensor`):
                The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that
                contains the image features, the pooled image features and the hidden states if
                `output_hidden_states=True`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Last hidden state of the vision encoder (patch-token sequence).
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_outputs = self.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return query_outputs

    @add_start_docstrings_to_model_forward(Husky_INPUTS_DOCSTRING)
    # @replace_return_docstrings(output_type=HuskyForConditionalGenerationModelOutput, config_class=HuskyVisionConfig)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        input_ids: torch.FloatTensor,
        attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, HuskyForConditionalGenerationModelOutput]:
        """Full vision -> Q-Former -> LM pass; computes LM loss when `labels` given."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: forward the images through the vision encoder,
        # to get image embeddings of shape (batch_size, seq_len, hidden_size)
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_outputs = self.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0]

        # step 3: use the language model, conditioned on the query outputs and the prompt
        language_model_inputs = self.language_projection(query_output)
        assert language_model_inputs.shape[1] == self.num_queries

        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
        # Human: <img><IMAGE></img>. Give the describe Assistant:
        # position of <image>: [offset: offset+num_queries]

        # In-place overwrite of the placeholder slots with projected visual features;
        # sequence length is unchanged, so `attention_mask`/`labels` stay aligned.
        inputs_embeds[:, self.offset:self.offset + self.num_queries, :] = language_model_inputs
        if attention_mask is None:
            attention_mask = torch.ones_like(
                input_ids, dtype=torch.long, device=language_model_inputs.device)

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = outputs.logits if return_dict else outputs[0]
        loss = None
        # we compute the loss here since we need to take into account the sequence length of the query embeds
        if labels is not None:
            labels = labels.to(logits.device)
            # Keep only the logits aligned with the provided labels (suffix of the sequence).
            logits = logits[:, -labels.size(1):, :]
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous().to(logits.device)

            # Flatten the tokens
            loss_fct = CrossEntropyLoss(reduction="mean")

            loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1))

        if not return_dict:
            output = (logits, vision_outputs, query_outputs, outputs)
            return ((loss,) + output) if loss is not None else output

        return HuskyForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )
1494
+
1495
@add_start_docstrings(
    """
    Husky Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    """,
    Husky_START_DOCSTRING,
)
class HuskyForConditionalGeneration(HuskyPreTrainedModel):
    # Unlike HuskyModel, this variant (a) appends 4 multi-level pooled CLS features
    # (via AdapterMLP) after the Q-Former queries, and (b) *inserts* the visual
    # embeddings at `offset` instead of overwriting placeholder tokens, so the LM
    # sequence grows by num_queries + 4.
    config_class = HuskyConfig
    main_input_name = "pixel_values"

    def __init__(self, config: HuskyConfig):
        super().__init__(config)

        self.vision_model = HuskyVisionModel(config.vision_config)
        # Learnable Q-Former queries, shape (1, num_query_tokens, qformer_hidden_size).
        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = HuskyQFormerModel(config.qformer_config)

        # Maps Q-Former outputs (and adapted pooled features) into LM embedding space.
        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)

        self.config.hidden_size = config.text_config.hidden_size
        self.num_queries = config.num_query_tokens
        # Insertion point of the visual tokens within the prompt embeddings.
        self.offset = 5

        # Adapter + per-level LayerNorms for 4 evenly-spaced vision hidden states
        # (their CLS tokens) — these produce the "+ 4" extra visual tokens.
        self.vision_adapter = AdapterMLP(config)
        self.layer_norms = nn.ModuleList()
        for i in range(4):
            self.layer_norms.append(
                nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
            )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # Delegate to the wrapped language model.
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self):
        return self.language_model.get_encoder()

    def get_decoder(self):
        return self.language_model.get_decoder()

    def extract_feature(
        self,
        pixel_values: torch.FloatTensor,
    ):
        """Compute the projected visual token sequence (num_queries + 4 tokens) for an image batch."""
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_hidden_states=True,
        )
        image_embeds = vision_outputs[0]

        # vision_outputs[2] is the tuple of per-layer hidden states
        # (available because output_hidden_states=True above).
        depth = len(vision_outputs[2])
        # 4 evenly-spaced layer indices ending at the last layer.
        indices = range(depth // 4 - 1, depth, depth // 4)
        pooled_outputs = []
        for idx, layer_norm in zip(indices, self.layer_norms):
            # [:, 0, :] takes the first token of each selected layer
            # (presumably the CLS token — verify against the vision encoder).
            pool_output = vision_outputs[2][idx][:, 0, :].unsqueeze(1)
            pool_output = layer_norm(pool_output)
            pooled_outputs.append(pool_output)

        pooled_outputs = torch.cat(pooled_outputs, dim=1)
        pooled_outputs = self.vision_adapter(pooled_outputs)

        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_outputs = self.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask
        )
        query_output = query_outputs[0]
        # Append the 4 adapted pooled features after the query outputs.
        query_output = torch.cat([query_output, pooled_outputs], dim=1)
        language_model_inputs = self.language_projection(query_output)

        return language_model_inputs

    def _tie_weights(self):
        # Only encoder-decoder LMs share one embedding matrix; decoder-only LMs
        # manage tying internally.
        if not self.config.use_decoder_only_language_model:
            self.language_model.encoder.embed_tokens = self.language_model.shared
            self.language_model.decoder.embed_tokens = self.language_model.shared

    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        hf_device_map = self.hf_device_map

        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # warn users about unexpected behavior when using multi-GPU + Husky + `accelerate`.
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models.",
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility

    @add_start_docstrings_to_model_forward(Husky_INPUTS_DOCSTRING)
    # @replace_return_docstrings(output_type=HuskyForConditionalGenerationModelOutput, config_class=HuskyVisionConfig)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        input_ids: torch.FloatTensor,
        attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, HuskyForConditionalGenerationModelOutput]:
        """Training/eval pass: inserts visual tokens into the prompt and runs the LM."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: forward the images through the vision encoder,
        # to get image embeddings of shape (batch_size, seq_len, hidden_size)
        batch_size = input_ids.shape[0]
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=return_dict,
        )
        image_embeds = vision_outputs[0]

        # Multi-level pooled features: 4 evenly-spaced layers' first token,
        # layer-normed then adapted (same recipe as extract_feature).
        depth = len(vision_outputs[2])
        indices = range(depth // 4 - 1, depth, depth // 4)
        pooled_outputs = []
        for idx, layer_norm in zip(indices, self.layer_norms):
            pool_output = vision_outputs[2][idx][:, 0, :].unsqueeze(1)
            pool_output = layer_norm(pool_output)
            pooled_outputs.append(pool_output)

        pooled_outputs = torch.cat(pooled_outputs, dim=1)
        pooled_outputs = self.vision_adapter(pooled_outputs)

        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_outputs = self.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0]
        query_output = torch.cat([query_output, pooled_outputs], dim=1)

        # step 3: use the language model, conditioned on the query outputs and the prompt
        language_model_inputs = self.language_projection(query_output)
        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
        # Human: <img></img>. Give the describe Assistant:
        # position of <image>: [offset: offset+num_queries]

        # Visual tokens are *inserted* (not overwritten), lengthening the sequence
        # by num_queries + 4; the masks below are extended to match.
        # inputs_embeds[:, self.offset:self.offset + self.num_queries, :] = language_model_inputs
        prefix_embeds = inputs_embeds[:, :self.offset, :]
        postfix_embeds = inputs_embeds[:, self.offset:, :]
        inputs_embeds = torch.cat([prefix_embeds, language_model_inputs, postfix_embeds], dim=1)

        if attention_mask is None:
            # NOTE(review): `ones_like(inputs_embeds)` yields a 3-D float-shaped mask
            # (batch, seq, hidden); attention masks are normally 2-D (batch, seq) —
            # likely should be torch.ones(inputs_embeds.shape[:2], ...). Verify.
            attention_mask = torch.ones_like(
                inputs_embeds, dtype=torch.long, device=language_model_inputs.device)
        else:
            prefix_mask = attention_mask[:, :self.offset]
            postfix_mask = attention_mask[:, self.offset:]
            # "+ 4" accounts for the 4 adapted pooled-feature tokens appended above.
            vision_mask = torch.ones(size=(batch_size, self.num_queries + 4), dtype=torch.long,
                                     device=attention_mask.device)
            attention_mask = torch.cat([prefix_mask, vision_mask, postfix_mask], dim=-1)

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = outputs.logits if return_dict else outputs[0]
        loss = None
        # we compute the loss here since we need to take into account the sequence length of the query embeds
        if labels is not None:
            labels = labels.to(logits.device)
            # Keep only the logits aligned with the provided labels (suffix of the sequence).
            logits = logits[:, -labels.size(1):, :]
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous().to(logits.device)

            # Flatten the tokens
            loss_fct = CrossEntropyLoss(reduction="mean")

            loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1))

        if not return_dict:
            output = (logits, vision_outputs, query_outputs, outputs)
            return ((loss,) + output) if loss is not None else output

        return HuskyForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        language_model_inputs: Optional[torch.FloatTensor] = None,
        generation_config: Optional[GenerationConfig] = None,
        **generate_kwargs,
    ) -> torch.LongTensor:
        """
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)):
                Input images to be processed. Ignored when `language_model_inputs` is supplied.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices
            language_model_inputs (`torch.LongTensor` of shape (batch_size, sequence_length, num_channel), *optional*):
                Precomputed visual token embeddings (e.g. from `extract_feature`); when given,
                the vision encoder / Q-Former are skipped.
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        """
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        if language_model_inputs is None:
            # Same visual pipeline as extract_feature / forward.
            vision_outputs = self.vision_model(
                pixel_values=pixel_values,
                output_hidden_states=True,
            )
            image_embeds = vision_outputs[0]

            depth = len(vision_outputs[2])
            indices = range(depth // 4 - 1, depth, depth // 4)
            pooled_outputs = []
            for idx, layer_norm in zip(indices, self.layer_norms):
                pool_output = vision_outputs[2][idx][:, 0, :].unsqueeze(1)
                pool_output = layer_norm(pool_output)
                pooled_outputs.append(pool_output)

            pooled_outputs = torch.cat(pooled_outputs, dim=1)
            pooled_outputs = self.vision_adapter(pooled_outputs)

            image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

            query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
            query_outputs = self.qformer(
                query_embeds=query_tokens,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_attention_mask,
            )
            query_output = query_outputs[0]
            query_output = torch.cat([query_output, pooled_outputs], dim=1)

            language_model_inputs = self.language_projection(query_output)

        batch_size = language_model_inputs.shape[0]
        # NOTE(review): `input_ids` is embedded here *before* the `input_ids is None`
        # fallback below, so a None prompt raises instead of using the BOS fallback —
        # the fallback block appears unreachable in its intended case. Verify.
        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)

        prefix_embeds = inputs_embeds[:, :self.offset, :]
        postfix_embeds = inputs_embeds[:, self.offset:, :]
        inputs_embeds = torch.cat([prefix_embeds, language_model_inputs, postfix_embeds], dim=1)

        if input_ids is None:
            input_ids = (
                torch.LongTensor([[self.config.text_config.bos_token_id]])
                .repeat(batch_size, 1)
                .to(inputs_embeds.device)
            )

        if attention_mask is None:
            # NOTE(review): this mask covers only `input_ids` length, not the
            # num_queries + 4 inserted visual tokens, so it is shorter than
            # `inputs_embeds` — the else-branch below extends the mask; this
            # branch does not. Verify against `language_model.generate` usage.
            attention_mask = torch.ones_like(
                input_ids, dtype=torch.long, device=language_model_inputs.device)
        else:
            prefix_mask = attention_mask[:, :self.offset]
            postfix_mask = attention_mask[:, self.offset:]
            # "+ 4" matches the 4 adapted pooled-feature tokens.
            vision_mask = torch.ones(size=(batch_size, self.num_queries + 4), dtype=torch.long,
                                     device=attention_mask.device)
            attention_mask = torch.cat([prefix_mask, vision_mask, postfix_mask], dim=-1)

        outputs = self.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            generation_config=generation_config,
            **generate_kwargs,
        )

        return outputs
robohusky/model/modeling_husky_embody2.py ADDED
@@ -0,0 +1,1962 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The Salesforce Authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch Husky model."""
16
+
17
+ import contextlib
18
+ import math
19
+ from dataclasses import dataclass
20
+ from typing import Any, Optional, Tuple, Union
21
+
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import CrossEntropyLoss
26
+
27
+ from transformers.activations import ACT2FN
28
+ from transformers.modeling_outputs import (
29
+ BaseModelOutput,
30
+ BaseModelOutputWithPastAndCrossAttentions,
31
+ BaseModelOutputWithPooling,
32
+ BaseModelOutputWithPoolingAndCrossAttentions,
33
+ )
34
+ from transformers.modeling_utils import PreTrainedModel
35
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
36
+ from transformers.utils import (
37
+ ModelOutput,
38
+ add_start_docstrings,
39
+ add_start_docstrings_to_model_forward,
40
+ logging,
41
+ replace_return_docstrings,
42
+ is_flash_attn_available
43
+ )
44
+ from transformers import AutoModelForCausalLM, GenerationConfig
45
+
46
+ from .configuration_husky import HuskyConfig, HuskyQFormerConfig, HuskyVisionConfig
47
+
48
+ if is_flash_attn_available():
49
+ from flash_attn import flash_attn_func
50
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
51
+
52
+ try:
53
+ from apex.normalization import FusedLayerNorm as LayerNorm
54
+ except ImportError:
55
+ from torch.nn import LayerNorm as LayerNorm
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ _CHECKPOINT_FOR_DOC = "wofmanaf/husky-7b"
60
+
61
+ HUSKY_PRETRAINED_MODEL_ARCHIVE_LIST = [
62
+ "wofmanaf/husky-7b",
63
+ ]
64
+
65
@dataclass
class HuskyForConditionalGenerationModelOutput(ModelOutput):
    """
    Output container for [`HuskyForConditionalGeneration`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss produced by the language model.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language model head.
        vision_outputs (`BaseModelOutputWithPooling`):
            Outputs of the vision encoder.
        qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
            Outputs of the Q-Former (Querying Transformer).
        language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
            Outputs of the language model.
    """

    loss: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    vision_outputs: Optional[torch.FloatTensor] = None
    qformer_outputs: Optional[Tuple[torch.FloatTensor]] = None
    language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None

    def to_tuple(self) -> Tuple[Any]:
        # The three sub-model entries are themselves ModelOutputs, so they are
        # flattened recursively; everything else is passed through unchanged.
        nested_keys = ("vision_outputs", "qformer_outputs", "language_model_outputs")
        flattened = []
        for key in self.keys():
            if key in nested_keys:
                flattened.append(getattr(self, key).to_tuple())
            else:
                flattened.append(self[key])
        return tuple(flattened)
96
+
97
# Copied from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->Husky
class HuskyVisionEmbeddings(nn.Module):
    """Patchify a batch of images and prepend a learned [CLS] token plus position embeddings."""

    def __init__(self, config: HuskyVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Learned [CLS] token, broadcast over the batch in forward().
        self.class_embedding = nn.Parameter(
            torch.randn(1, 1, self.embed_dim),
        )

        # Non-overlapping patch projection: kernel == stride == patch size.
        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        num_images = pixel_values.shape[0]
        weight_dtype = self.patch_embedding.weight.dtype

        # (B, 3, H, W) -> (B, D, H/P, W/P) -> (B, num_patches, D)
        patch_tokens = self.patch_embedding(pixel_values)
        patch_tokens = patch_tokens.flatten(2).transpose(1, 2)

        cls_tokens = self.class_embedding.expand(num_images, 1, -1).to(weight_dtype)
        tokens = torch.cat([cls_tokens, patch_tokens], dim=1)
        # Slice positions so shorter token sequences still line up.
        tokens = tokens + self.position_embedding[:, : tokens.size(1), :].to(weight_dtype)
        return tokens
129
+
130
class HuskyVideoEmbeddings(nn.Module):
    """Tubelet-embed a video clip and prepend a learned [CLS] token plus position embeddings."""

    def __init__(self, config: HuskyVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        # Fall back to 8 frames with temporal stride 2 when the config omits them.
        self.num_frames = getattr(self.config, "num_frames", 8)
        self.frame_stride = getattr(self.config, "frame_stride", 2)

        self.class_embedding = nn.Parameter(
            torch.randn(1, 1, self.embed_dim),
        )

        # The 3D conv groups `frame_stride` consecutive frames into one tubelet token.
        self.patch_embedding = nn.Conv3d(
            in_channels=3, out_channels=self.embed_dim,
            kernel_size=(self.frame_stride, self.patch_size, self.patch_size),
            stride=(self.frame_stride, self.patch_size, self.patch_size)
        )

        self.num_patches = int(self.num_frames // self.frame_stride) * (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        num_clips = pixel_values.shape[0]
        weight_dtype = self.patch_embedding.weight.dtype

        # (B, 3, T, H, W) -> (B, D, T', H', W') -> (B, tokens, D)
        tubelet_tokens = self.patch_embedding(pixel_values)
        tubelet_tokens = tubelet_tokens.flatten(2).transpose(1, 2)

        cls_tokens = self.class_embedding.expand(num_clips, 1, -1).to(weight_dtype)
        tokens = torch.cat([cls_tokens, tubelet_tokens], dim=1)
        tokens = tokens + self.position_embedding[:, : tokens.size(1), :].to(weight_dtype)
        return tokens
165
+
166
class HuskyAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim ** -0.5
        self.dropout = nn.Dropout(config.attention_dropout)

        # Fused q/k/v projection. Unlike CLIP there is no built-in bias; one is
        # attached below only when the config asks for it.
        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)

        if config.qkv_bias:
            # BLIP-2 style bias: learnable for q and v, fixed zeros for k.
            q_bias = nn.Parameter(torch.zeros(self.embed_dim))
            v_bias = nn.Parameter(torch.zeros(self.embed_dim))
            self.qkv.bias = nn.Parameter(
                torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
            )

        self.projection = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, bsz: int, seq_len: int):
        # Reshape a (B, T, D) tensor into per-head layout (B, heads, T, head_dim).
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        batch, seq_len, dim = hidden_states.size()

        # One matmul, then split into q/k/v: (B, T, 3D) -> (3, B, heads, T, head_dim).
        fused = (
            self.qkv(hidden_states)
            .reshape(batch, seq_len, 3, self.num_heads, dim // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        queries, keys, values = fused.unbind(0)

        # Scaled dot-product scores, softmax-normalized per query position.
        scores = torch.matmul(queries, keys.transpose(-1, -2)) * self.scale
        probs = nn.functional.softmax(scores, dim=-1)

        # Token-level dropout on the attention map (original Transformer recipe).
        probs = self.dropout(probs)

        # Optional per-head masking.
        if head_mask is not None:
            probs = probs * head_mask

        context = torch.matmul(probs, values).permute(0, 2, 1, 3)
        context = context.reshape(context.size()[:-2] + (self.embed_dim,))

        attn_output = self.projection(context)

        if output_attentions:
            return (attn_output, probs)
        return (attn_output, None)
249
+
250
class HuskyFlashAttention2(HuskyAttention):
    """
    Husky flash attention module. This module inherits from `HuskyAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, None]:
        """Flash-attention forward pass.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, embed_dim)`): input sequence.
            head_mask: accepted for interface compatibility with `HuskyAttention` but
                ignored here — the flash-attention kernel does not support it.
            output_attentions: must be `False`; the kernel never materializes the
                attention matrix, so weights cannot be returned.

        Returns:
            `(attn_output, None)` where `attn_output` has shape `(batch, seq_len, embed_dim)`.
        """
        # FIX: the return annotation was the builtin generic `tuple[Any, None]`, which
        # raises `TypeError` at class-definition time on Python 3.8 (the version this
        # package ships bytecode for); `typing.Tuple` is used instead, matching the
        # annotation style of the rest of the file.

        # HuskyFlashAttention2 does not support output_attentions
        assert output_attentions is False

        bsz, tgt_len, embed_dim = hidden_states.size()

        mixed_qkv = self.qkv(hidden_states)

        # Flash attention requires the input to have the shape batch_size x seq_len x num_heads x head_dim
        # therefore we just need to keep the original shape
        mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
            2, 0, 1, 3, 4
        )

        query_states, key_states, value_states = (
            mixed_qkv[0],
            mixed_qkv[1],
            mixed_qkv[2],
        )

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            # flash-attn only runs in half precision; cast to the dtype the weights
            # were trained in (or the pre-quantization dtype for quantized models).
            if hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.qkv.weight.dtype

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = flash_attn_func(
            query_states, key_states, value_states
        )

        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim).contiguous()
        output = self.projection(attn_output)

        outputs = (output, None)
        return outputs
303
+
304
class HuskyMLP(nn.Module):
    """Two-layer feed-forward block: Linear -> activation -> Linear."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Activation is resolved by name through the shared ACT2FN registry.
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Expand to the intermediate size, apply the nonlinearity, project back.
        return self.fc2(self.activation_fn(self.fc1(hidden_states)))
317
+
318
# Copied from transformers.models.blip.modeling_blip.BlipEncoderLayer with Blip->Husky
class HuskyEncoderLayer(nn.Module):
    """Pre-LayerNorm transformer encoder block: self-attention and MLP, each with a residual."""

    def __init__(self, config: HuskyConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        # Use the flash-attention variant only when it was explicitly enabled on the config.
        if getattr(config, "_flash_attn_2_enabled", False):
            self.self_attn = HuskyFlashAttention2(config=config)
        else:
            self.self_attn = HuskyAttention(config=config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = HuskyMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): layer input of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`): mask of size `(batch, 1, tgt_len, src_len)` with
                large negative values at padded positions; forwarded to the attention module as
                its `head_mask` argument (mirroring the BLIP layer this was copied from).
            output_attentions (`bool`, *optional*): when `True`, also return this layer's
                attention weights.
        """
        # Attention sub-block (pre-norm + residual).
        shortcut = hidden_states
        normed = self.layer_norm1(hidden_states)
        attn_out, attn_weights = self.self_attn(
            hidden_states=normed,
            head_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = attn_out + shortcut

        # Feed-forward sub-block (pre-norm + residual).
        shortcut = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states) + shortcut

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
369
+
370
class HuskyPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = HuskyConfig
    base_model_prefix = "husky"
    supports_gradient_checkpointing = True
    # Checkpoint keys whose absence is tolerated on load (tied/derived weights).
    _keys_to_ignore_on_load_missing = [
        r"position_ids",
        r"language_model.encoder.embed_tokens.weight",
        r"language_model.decoder.embed_tokens.weight",
        r"language_model.lm_head.weight",
    ]
    # Modules that device-map sharding must keep on a single device.
    _no_split_modules = [
        "HuskyAttention",
        "HuskyFlashAttention2",
        "LlamaDecoderLayer",
    ]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    # Keep these modules in fp32 even under half-precision loading.
    _keep_in_fp32_modules = ["wo"]

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_range
        # NOTE(review): nn.Conv3d (used by HuskyVideoEmbeddings.patch_embedding) is not
        # matched here, so the video patch conv keeps PyTorch's default init — confirm
        # this is intended.
        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=factor)
            if hasattr(module, "bias") and module.bias is not None:
                module.bias.data.zero_()

        if isinstance(module, HuskyVisionEmbeddings):
            # Prefer the vision sub-config's initializer range when available.
            if hasattr(self.config, "vision_config"):
                factor = self.config.vision_config.initializer_range
            nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
            nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)

        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        # NOTE(review): this branch is unreachable — nn.Linear (bias included) is
        # already fully handled by the first `if` above.
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        # Gradient checkpointing is only toggled on the vision encoder stack.
        if isinstance(module, HuskyEncoder):
            module.gradient_checkpointing = value
417
+
418
+ Husky_START_DOCSTRING = r"""
419
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
420
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
421
+ etc.)
422
+
423
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
424
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
425
+ and behavior.
426
+
427
+ Parameters:
428
+ config ([`HuskyConfig`]): Model configuration class with all the parameters of the model.
429
+ Initializing with a config file does not load the weights associated with the model, only the
430
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
431
+ """
432
+
433
+ Husky_VISION_INPUTS_DOCSTRING = r"""
434
+ Args:
435
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
436
+ Pixel values. Pixel values can be obtained using [`HuskyProcessor`]. See [`HuskyProcessor.__call__`] for
437
+ details.
438
+ output_attentions (`bool`, *optional*):
439
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
440
+ tensors for more detail.
441
+ output_hidden_states (`bool`, *optional*):
442
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
443
+ more detail.
444
+ return_dict (`bool`, *optional*):
445
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
446
+ """
447
+
448
+ Husky_TEXT_INPUTS_DOCSTRING = r"""
449
+ Args:
450
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
451
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
452
+ it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
453
+ [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
454
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
455
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
456
+ - 1 for tokens that are **not masked**,
457
+ - 0 for tokens that are **masked**.
458
+ [What are attention masks?](../glossary#attention-mask)
459
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
460
+ Indices of decoder input sequence tokens in the vocabulary.
461
+
462
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
463
+ [`PreTrainedTokenizer.__call__`] for details.
464
+
465
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
466
+
467
+ T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
468
+ is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
469
+
470
+ To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
471
+ Training](./t5#training).
472
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
473
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
474
+ be used by default.
475
+ output_attentions (`bool`, *optional*):
476
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
477
+ tensors for more detail.
478
+ output_hidden_states (`bool`, *optional*):
479
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
480
+ more detail.
481
+ return_dict (`bool`, *optional*):
482
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
483
+ """
484
+
485
+ Husky_INPUTS_DOCSTRING = r"""
486
+ Args:
487
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
488
+ Pixel values. Pixel values can be obtained using [`HuskyProcessor`]. See [`HuskyProcessor.__call__`] for
489
+ details.
490
+
491
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
492
+ Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
493
+ provided to serve as text prompt, which the language model can continue.
494
+
495
+ Indices can be obtained using [`HuskyProcessor`]. See [`HuskyProcessor.__call__`] for details.
496
+
497
+ [What are input IDs?](../glossary#input-ids)
498
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
499
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
500
+
501
+ - 1 for tokens that are **not masked**,
502
+ - 0 for tokens that are **masked**.
503
+
504
+ [What are attention masks?](../glossary#attention-mask)
505
+
506
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
507
+ Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an
508
+ encoder-decoder language model (like T5) is used.
509
+
510
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
511
+ [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids)
512
+
513
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
514
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
515
+ be used by default.
516
+
517
+ Only relevant in case an encoder-decoder language model (like T5) is used.
518
+
519
+ output_attentions (`bool`, *optional*):
520
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
521
+ tensors for more detail.
522
+ output_hidden_states (`bool`, *optional*):
523
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
524
+ more detail.
525
+ return_dict (`bool`, *optional*):
526
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
527
+ """
528
+
529
# Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->Husky
class HuskyEncoder(nn.Module):
    """
    Transformer encoder: a stack of `config.num_hidden_layers` [`HuskyEncoderLayer`] blocks.

    Args:
        config (`HuskyConfig`):
            The corresponding vision configuration for the `HuskyEncoder`.
    """

    def __init__(self, config: HuskyConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([HuskyEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Run the layer stack over pre-computed embeddings.

        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded inputs (floats, not token ids).
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask forwarded unchanged to every layer (1 = keep, 0 = masked).
            output_attentions (`bool`, *optional*):
                Whether to collect each layer's attention weights.
            output_hidden_states (`bool`, *optional*):
                Whether to collect the hidden states before every layer and after the last.
            return_dict (`bool`, *optional*):
                Whether to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        # Unset toggles fall back to the config defaults.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for layer_module in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                # Trade compute for memory: re-run the layer in the backward pass.
                def make_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    make_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
        )
620
+
621
# Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->Husky, BLIP->Husky
class HuskyVisionModel(HuskyPreTrainedModel):
    """Vision tower: image/video embeddings + transformer encoder + final LayerNorm."""

    main_input_name = "pixel_values"
    config_class = HuskyVisionConfig

    def __init__(self, config: HuskyVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        # Two embedding front-ends share one encoder: 2D patches for images,
        # 3D tubelets for videos. forward() dispatches on input rank.
        self.embeddings = HuskyVisionEmbeddings(config)
        self.video_embeddings = HuskyVideoEmbeddings(config)

        self.encoder = HuskyEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @add_start_docstrings_to_model_forward(Husky_VISION_INPUTS_DOCSTRING)
    # @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=HuskyVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_embeds: Optional[torch.FloatTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """Encode images (4D input), videos (5D input), or pre-computed embeddings.

        Raises:
            ValueError: if neither `pixel_values` nor `pixel_embeds` is given, or if
                `pixel_values` is neither 4D nor 5D.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None and pixel_embeds is None:
            raise ValueError("You have to specify pixel_values or pixel_embeds")

        # `pixel_embeds` bypasses the embedding front-ends entirely.
        if pixel_embeds is not None:
            hidden_states = pixel_embeds
        else:
            # Dispatch on rank: (B, C, H, W) -> image path, (B, C, T, H, W) -> video path.
            if len(pixel_values.shape) == 4:
                hidden_states = self.embeddings(pixel_values)
            elif len(pixel_values.shape) == 5:
                hidden_states = self.video_embeddings(pixel_values)
            else:
                raise ValueError(f"wrong pixel_values size: {pixel_values.shape}")

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        # NOTE(review): the pooled [CLS] token goes through post_layernorm a second
        # time (it was already normalized above). This mirrors the upstream BLIP
        # implementation this class was copied from — confirm before "fixing".
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        # Image (2D) embedding module.
        return self.embeddings

    def get_video_embeddings(self):
        # Video (3D tubelet) embedding module.
        return self.video_embeddings
696
+
697
+ class HuskyQFormerMultiHeadAttention(nn.Module):
698
+ def __init__(self, config, is_cross_attention=False):
699
+ super().__init__()
700
+ self.config = config
701
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
702
+ raise ValueError(
703
+ "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
704
+ % (config.hidden_size, config.num_attention_heads)
705
+ )
706
+
707
+ self.num_attention_heads = config.num_attention_heads
708
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
709
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
710
+
711
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
712
+ if is_cross_attention:
713
+ self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
714
+ self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
715
+ else:
716
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
717
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
718
+
719
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
720
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
721
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
722
+ self.max_position_embeddings = config.max_position_embeddings
723
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
724
+ self.save_attention = False
725
+
726
+ def save_attn_gradients(self, attn_gradients):
727
+ self.attn_gradients = attn_gradients
728
+
729
+ def get_attn_gradients(self):
730
+ return self.attn_gradients
731
+
732
+ def save_attention_map(self, attention_map):
733
+ self.attention_map = attention_map
734
+
735
+ def get_attention_map(self):
736
+ return self.attention_map
737
+
738
+ def transpose_for_scores(self, x):
739
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
740
+ x = x.view(*new_x_shape)
741
+ return x.permute(0, 2, 1, 3)
742
+
743
def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_value=None,
    output_attentions=False,
):
    """Multi-head self- or cross-attention with optional relative-position scores.

    Returns ``(context, [attention_probs,] present_key_value)`` where the
    optional attention maps are included only when ``output_attentions`` is set.
    """
    cross_attn = encoder_hidden_states is not None

    # For cross-attention the keys/values come from the encoder and the mask
    # must hide the encoder's padding tokens; otherwise they come from
    # ``hidden_states``, optionally prefixed with the cached past states.
    if cross_attn:
        k = self.transpose_for_scores(self.key(encoder_hidden_states))
        v = self.transpose_for_scores(self.value(encoder_hidden_states))
        attention_mask = encoder_attention_mask
    else:
        k = self.transpose_for_scores(self.key(hidden_states))
        v = self.transpose_for_scores(self.value(hidden_states))
        if past_key_value is not None:
            k = torch.cat([past_key_value[0], k], dim=2)
            v = torch.cat([past_key_value[1], v], dim=2)

    q = self.transpose_for_scores(self.query(hidden_states))
    present = (k, v)

    # Raw attention scores: dot product between queries and keys.
    scores = torch.matmul(q, k.transpose(-1, -2))

    if self.position_embedding_type in ("relative_key", "relative_key_query"):
        seq_len = hidden_states.size(1)
        pos_l = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device).view(-1, 1)
        pos_r = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device).view(1, -1)
        rel_pos = self.distance_embedding(pos_l - pos_r + self.max_position_embeddings - 1)
        rel_pos = rel_pos.to(dtype=q.dtype)  # fp16 compatibility

        if self.position_embedding_type == "relative_key":
            scores = scores + torch.einsum("bhld,lrd->bhlr", q, rel_pos)
        else:  # "relative_key_query"
            scores = (
                scores
                + torch.einsum("bhld,lrd->bhlr", q, rel_pos)
                + torch.einsum("bhrd,lrd->bhlr", k, rel_pos)
            )

    scores = scores / math.sqrt(self.attention_head_size)

    if attention_mask is not None:
        # Additive mask (precomputed upstream): ~-inf on masked positions.
        scores = scores + attention_mask

    # Normalize the scores to probabilities.
    probs = scores.softmax(dim=-1)

    if cross_attn and self.save_attention:
        self.save_attention_map(probs)
        probs.register_hook(self.save_attn_gradients)

    # Dropping out attention probabilities drops whole tokens to attend to,
    # as in the original Transformer paper.
    dropped = self.dropout(probs)

    if head_mask is not None:
        dropped = dropped * head_mask

    context = torch.matmul(dropped, v).permute(0, 2, 1, 3).contiguous()
    context = context.view(*(context.size()[:-2] + (self.all_head_size,)))

    outputs = (context, probs) if output_attentions else (context,)
    return outputs + (present,)
827
+
828
class HuskyQFormerFlashAttention2(HuskyQFormerMultiHeadAttention):
    """Q-Former attention variant that routes the attention math through flash-attention.

    NOTE(review): ``flash_attn_func`` is invoked without any mask, so both
    ``attention_mask`` and ``encoder_attention_mask`` are silently ignored
    here (padding tokens are attended to) — confirm this is acceptable for
    the inputs this model sees. ``dropout_rate`` is computed but never passed
    to the kernel, and the final reshape reads ``self.embed_size``, which is
    not visibly defined by the parent class (which uses ``all_head_size``)
    — verify against the parent's ``__init__``.
    """

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        bsz, tgt_len, _ = hidden_states.size()
        cross_attn = encoder_hidden_states is not None

        # Keys/values come from the encoder for cross-attention; otherwise
        # from ``hidden_states``, optionally prefixed with the cached states.
        if cross_attn:
            k = self.transpose_for_scores(self.key(encoder_hidden_states))
            v = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        else:
            k = self.transpose_for_scores(self.key(hidden_states))
            v = self.transpose_for_scores(self.value(hidden_states))
            if past_key_value is not None:
                k = torch.cat([past_key_value[0], k], dim=2)
                v = torch.cat([past_key_value[1], v], dim=2)

        q = self.transpose_for_scores(self.query(hidden_states))
        present = (k, v)

        # flash_attn expects [batch, seq, heads, head_dim] rather than the
        # [batch, heads, seq, head_dim] layout produced by transpose_for_scores.
        q, k, v = (t.transpose(1, 2) for t in (q, k, v))

        dropout_rate = self.dropout if self.training else 0  # currently unused — see class note

        if q.dtype == torch.float32:
            # flash_attn requires fp16/bf16; follow autocast when it is active,
            # otherwise fall back to the projection weights' dtype.
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            else:
                target_dtype = self.query.weight.dtype
            q = q.to(target_dtype)
            k = k.to(target_dtype)
            v = v.to(target_dtype)

        attn_output = flash_attn_func(q, k, v, causal=False)

        context = attn_output.reshape(bsz, tgt_len, self.embed_size).contiguous()
        return (context,) + (present,)
893
+
894
class HuskyQFormerSelfOutput(nn.Module):
    """Post-attention projection: Linear -> Dropout -> residual add -> LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        projected = self.dropout(self.dense(hidden_states))
        # Residual connection followed by LayerNorm.
        return self.LayerNorm(projected + input_tensor)
906
+
907
class HuskyQFormerAttention(nn.Module):
    """Attention block: multi-head (or flash) attention plus the output projection."""

    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        # Switch to the flash-attention implementation only when the config
        # explicitly opts in.
        use_flash = getattr(config, "_flash_attn_2_enabled", False)
        attn_cls = HuskyQFormerFlashAttention2 if use_flash else HuskyQFormerMultiHeadAttention
        self.attention = attn_cls(config, is_cross_attention)
        self.output = HuskyQFormerSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads and shrink the projections accordingly."""
        if not heads:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Drop the pruned rows/columns from the projection matrices.
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep the head bookkeeping in sync with the new sizes.
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        attn_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attention_output = self.output(attn_outputs[0], hidden_states)
        # Propagate attention maps / cache entries produced by the inner module.
        return (attention_output,) + attn_outputs[1:]
958
+
959
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->HuskyQFormer
class HuskyQFormerIntermediate(nn.Module):
    """Feed-forward expansion: Linear followed by the configured activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # ``hidden_act`` may be either the name of an activation or a callable.
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.intermediate_act_fn(self.dense(hidden_states))
973
+
974
# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->HuskyQFormer
class HuskyQFormerOutput(nn.Module):
    """Feed-forward contraction with dropout and a residual LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        contracted = self.dropout(self.dense(hidden_states))
        # Residual connection followed by LayerNorm.
        return self.LayerNorm(contracted + input_tensor)
987
+
988
class HuskyQFormerLayer(nn.Module):
    """One Q-Former layer: self-attention, optional cross-attention, feed-forward.

    Cross-attention is instantiated only every ``config.cross_attention_frequency``
    layers and is applied to the query-token positions only.

    NOTE(review): ``feed_forward_chunk`` references ``self.intermediate`` /
    ``self.output``, which are never created in ``__init__`` (only the
    ``*_query`` variants are). The paths that use it — ``query_length == 0``
    or text tokens appended after the queries — would raise ``AttributeError``;
    presumably they are never hit in this model (the same quirk exists in the
    upstream BLIP-2 Q-Former). Confirm before relying on those paths.
    """

    def __init__(self, config, layer_idx):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = HuskyQFormerAttention(config)
        self.layer_idx = layer_idx

        # Only every ``cross_attention_frequency``-th layer attends to the encoder.
        self.has_cross_attention = layer_idx % config.cross_attention_frequency == 0
        if self.has_cross_attention:
            self.crossattention = HuskyQFormerAttention(config, is_cross_attention=True)

        self.intermediate_query = HuskyQFormerIntermediate(config)
        self.output_query = HuskyQFormerOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        query_length=0,
    ):
        # Cached uni-directional self-attention key/values occupy positions 1,2.
        self_past = past_key_value[:2] if past_key_value is not None else None
        self_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_past,
        )
        attention_output = self_outputs[0]
        outputs = self_outputs[1:-1]
        present_key_value = self_outputs[-1]

        if query_length > 0:
            # The first ``query_length`` positions are the learned query tokens.
            query_attention_output = attention_output[:, :query_length, :]

            if self.has_cross_attention:
                if encoder_hidden_states is None:
                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
                cross_outputs = self.crossattention(
                    query_attention_output,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions=output_attentions,
                )
                query_attention_output = cross_outputs[0]
                # Append cross-attention weights when they are requested.
                outputs = outputs + cross_outputs[1:-1]

            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk_query,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                query_attention_output,
            )

            if attention_output.shape[1] > query_length:
                # Any trailing text tokens go through the plain feed-forward.
                text_output = apply_chunking_to_forward(
                    self.feed_forward_chunk,
                    self.chunk_size_feed_forward,
                    self.seq_len_dim,
                    attention_output[:, query_length:, :],
                )
                layer_output = torch.cat([layer_output, text_output], dim=1)
        else:
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )

        return (layer_output,) + outputs + (present_key_value,)

    def feed_forward_chunk(self, attention_output):
        # NOTE(review): ``self.intermediate`` / ``self.output`` are never set
        # in ``__init__`` — see the class docstring.
        return self.output(self.intermediate(attention_output), attention_output)

    def feed_forward_chunk_query(self, attention_output):
        return self.output_query(self.intermediate_query(attention_output), attention_output)
1086
+
1087
class HuskyQFormerEncoder(nn.Module):
    """Stack of ``HuskyQFormerLayer`` modules with optional gradient checkpointing."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [HuskyQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        query_length=0,
    ):
        """Run the layer stack; collect hidden states / attentions / cache on demand."""
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[idx] if head_mask is not None else None
            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if getattr(self.config, "gradient_checkpointing", False) and self.training:
                if use_cache:
                    logger.warn(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                # Non-tensor arguments cannot flow through ``checkpoint``;
                # close over them instead.
                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, past_key_value, output_attentions, query_length)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    query_length,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if layer_module.has_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in (
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                )
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
1187
+
1188
class HuskyQFormerModel(HuskyPreTrainedModel):
    """
    Querying Transformer (Q-Former), used in Husky.

    Consumes learned query embeddings and (optionally) cross-attends to the
    vision encoder's hidden states.
    """

    def __init__(self, config: HuskyQFormerConfig):
        super().__init__(config)
        self.config = config

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.encoder = HuskyQFormerEncoder(config)

        self.post_init()

    def get_input_embeddings(self):
        # NOTE(review): ``self.embeddings`` is never created in ``__init__`` —
        # calling this raises ``AttributeError``. Verify whether an embeddings
        # module was meant to be kept when the text pathway was removed.
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        # NOTE(review): see ``get_input_embeddings`` — ``self.embeddings`` is undefined.
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer_num, heads in heads_to_prune.items():
            self.encoder.layer[layer_num].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: Tuple[int],
        device: torch.device,
        has_query: bool = False,
    ) -> torch.Tensor:
        """Broadcast a 2D/3D attention mask to an additive [batch, heads, from, to] bias.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model (used only for the error message).
            device (`torch.device`):
                The device of the input to the model.
            has_query (`bool`):
                Accepted for interface compatibility; unused in this implementation.

        Returns:
            `torch.Tensor` with 0.0 on attended positions and -10000.0 on masked ones.
        """
        if attention_mask.dim() == 3:
            # [batch, from_seq, to_seq] supplied directly — broadcast over heads.
            extended = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Padding mask [batch, seq] — broadcast over heads and query positions.
            extended = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                    input_shape, attention_mask.shape
                )
            )

        # Turn {1: attend, 0: masked} into an additive bias: adding -10000.0
        # before the softmax effectively removes the masked positions.
        extended = extended.to(dtype=self.dtype)  # fp16 compatibility
        return (1.0 - extended) * -10000.0

    def forward(
        self,
        query_embeds,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
            Encoder hidden states the queries cross-attend to.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`):
            Mask over the encoder's padding tokens (1 = keep, 0 = masked).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, `optional`):
            Precomputed key/value states used to speed up decoding.
        use_cache (`bool`, `optional`):
            If `True`, return `past_key_values` for subsequent fast decoding.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Length of the already-cached sequence, queries excluded.
        past_key_values_length = (
            past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0
        )
        query_length = query_embeds.shape[1] if query_embeds is not None else 0

        embedding_output = self.dropout(self.layernorm(query_embeds))

        input_shape = embedding_output.size()[:-1]
        batch_size, seq_length = input_shape
        device = embedding_output.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        # Broadcastable self-attention bias for all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # Broadcast the (possibly per-image list of) encoder masks for cross-attention.
        if encoder_hidden_states is not None:
            if isinstance(encoder_hidden_states, list):
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if isinstance(encoder_attention_mask, list):
                encoder_extended_attention_mask = [self.invert_attention_mask(m) for m in encoder_attention_mask]
            else:
                if encoder_attention_mask is None:
                    encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # head_mask: [num_heads] or [num_hidden_layers x num_heads] converted to
        # [num_hidden_layers x batch x num_heads x seq_length x seq_length].
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            query_length=query_length,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = sequence_output[:, 0, :]  # first query position as pooled repr

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
1379
+
1380
class AdapterMLP(nn.Module):
    """Two-layer SiLU MLP mapping the vision hidden size down to the Q-Former size."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN["silu"]
        in_dim = config.vision_config.hidden_size
        mid_dim = in_dim // 4  # bottleneck at a quarter of the vision width
        out_dim = config.qformer_config.hidden_size

        self.fc1 = nn.Linear(in_dim, mid_dim)
        self.fc2 = nn.Linear(mid_dim, out_dim)
        self.layernorm = nn.LayerNorm(out_dim, eps=config.vision_config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        return self.layernorm(self.fc2(hidden_states))
1399
+
1400
+ @add_start_docstrings(
1401
+ """
1402
+ Husky Model for generating text and image features. The model consists of a vision encoder, Querying Transformer
1403
+ (Q-Former) and a language model.
1404
+ """,
1405
+ Husky_START_DOCSTRING,
1406
+ )
1407
+ class HuskyModel(HuskyPreTrainedModel):
1408
+ config_class = HuskyConfig
1409
+ main_input_name = "pixel_values"
1410
+
1411
def __init__(self, config: HuskyConfig):
    """Build the vision tower, Q-Former, projection layer and language model."""
    super().__init__(config)

    self.vision_model = HuskyVisionModel(config.vision_config)

    # Learned query tokens fed to the Q-Former (expanded across the batch).
    self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
    self.qformer = HuskyQFormerModel(config.qformer_config)

    # Project Q-Former outputs into the language model's embedding space.
    self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
    self.language_model = AutoModelForCausalLM.from_config(config.text_config)

    self.config.hidden_size = config.text_config.hidden_size
    self.num_queries = config.num_query_tokens
    # Token offset of the <IMAGE> placeholder in the prompt — presumably tied
    # to the fixed prompt format; verify against the tokenizer/prompt template.
    self.offset = 5

    # Initialize weights and apply final processing.
    self.post_init()
1428
+
1429
def get_input_embeddings(self):
    """Delegate to the wrapped language model's input embeddings."""
    return self.language_model.get_input_embeddings()
1431
+
1432
def set_input_embeddings(self, value):
    """Forward the new input-embedding module to the language model."""
    self.language_model.set_input_embeddings(value)
1434
+
1435
def set_output_embeddings(self, new_embeddings):
    """Forward the new output-embedding module to the language model."""
    self.language_model.set_output_embeddings(new_embeddings)
1437
+
1438
def get_output_embeddings(self) -> nn.Module:
    """Delegate to the wrapped language model's output embeddings (LM head)."""
    return self.language_model.get_output_embeddings()
1440
+
1441
def get_encoder(self):
    """Delegate to the wrapped language model's encoder (encoder-decoder LMs)."""
    return self.language_model.get_encoder()
1443
+
1444
def get_decoder(self):
    """Delegate to the wrapped language model's decoder."""
    return self.language_model.get_decoder()
1446
+
1447
def _tie_weights(self):
    """For encoder-decoder language models, share one token embedding table
    between the encoder and the decoder (no-op for decoder-only models)."""
    if not self.config.use_decoder_only_language_model:
        shared = self.language_model.shared
        self.language_model.encoder.embed_tokens = shared
        self.language_model.decoder.embed_tokens = shared
1451
+
1452
@add_start_docstrings_to_model_forward(Husky_TEXT_INPUTS_DOCSTRING)
def get_text_features(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
):
    r"""
    Run only the language model and return its outputs.

    Returns:
        text_outputs (`CausalLMOutputWithPast`, or `tuple(torch.FloatTensor)` if `return_dict=False`):
            The language model outputs (logits, past key values, and hidden
            states when `output_hidden_states=True`).
    """
    # Any flag left as ``None`` falls back to the config default.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if return_dict is None:
        return_dict = self.config.use_return_dict

    return self.language_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
1483
+
1484
@add_start_docstrings_to_model_forward(Husky_VISION_INPUTS_DOCSTRING)
def get_image_features(
    self,
    pixel_values: Optional[torch.FloatTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
):
    """Run only the vision encoder on ``pixel_values`` and return its outputs."""
    # Any flag left as ``None`` falls back to the config default.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if return_dict is None:
        return_dict = self.config.use_return_dict

    return self.vision_model(
        pixel_values=pixel_values,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
1506
+
1507
@add_start_docstrings_to_model_forward(Husky_INPUTS_DOCSTRING)
def get_qformer_features(
    self,
    pixel_values: Optional[torch.FloatTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
):
    r"""
    Encode ``pixel_values`` with the vision tower and pass the learned query
    tokens through the Q-Former, cross-attending to the image embeddings.

    Returns:
        query_outputs (`BaseModelOutputWithPooling` or tuple of `torch.FloatTensor`):
            The Q-Former outputs (pooled/per-query features, plus hidden
            states when `output_hidden_states=True`).
    """
    # Any flag left as ``None`` falls back to the config default.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if return_dict is None:
        return_dict = self.config.use_return_dict

    vision_outputs = self.vision_model(
        pixel_values=pixel_values,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    image_embeds = vision_outputs[0]

    # The queries may attend to every image token, so the mask is all ones.
    image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
    query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)

    return self.qformer(
        query_embeds=query_tokens,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
1551
+
1552
+ @add_start_docstrings_to_model_forward(Husky_INPUTS_DOCSTRING)
1553
+ # @replace_return_docstrings(output_type=HuskyForConditionalGenerationModelOutput, config_class=HuskyVisionConfig)
1554
+ def forward(
1555
+ self,
1556
+ pixel_values: torch.FloatTensor,
1557
+ input_ids: torch.FloatTensor,
1558
+ attention_mask: Optional[torch.LongTensor] = None,
1559
+ output_attentions: Optional[bool] = None,
1560
+ output_hidden_states: Optional[bool] = None,
1561
+ labels: Optional[torch.LongTensor] = None,
1562
+ return_dict: Optional[bool] = None,
1563
+ ) -> Union[Tuple, HuskyForConditionalGenerationModelOutput]:
1564
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1565
+
1566
+ # step 1: forward the images through the vision encoder,
1567
+ # to get image embeddings of shape (batch_size, seq_len, hidden_size)
1568
+ vision_outputs = self.vision_model(
1569
+ pixel_values=pixel_values,
1570
+ output_attentions=output_attentions,
1571
+ output_hidden_states=output_hidden_states,
1572
+ return_dict=return_dict,
1573
+ )
1574
+ image_embeds = vision_outputs[0]
1575
+
1576
+ # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
1577
+ image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
1578
+
1579
+ query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
1580
+ query_outputs = self.qformer(
1581
+ query_embeds=query_tokens,
1582
+ encoder_hidden_states=image_embeds,
1583
+ encoder_attention_mask=image_attention_mask,
1584
+ output_attentions=output_attentions,
1585
+ output_hidden_states=output_hidden_states,
1586
+ return_dict=return_dict,
1587
+ )
1588
+ query_output = query_outputs[0]
1589
+
1590
+ # step 3: use the language model, conditioned on the query outputs and the prompt
1591
+ language_model_inputs = self.language_projection(query_output)
1592
+ assert language_model_inputs.shape[1] == self.num_queries
1593
+
1594
+ inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
1595
+ # Human: <img><IMAGE></img>. Give the describe Assistant:
1596
+ # position of <image>: [offset: offset+num_queries]
1597
+
1598
+ inputs_embeds[:, self.offset:self.offset + self.num_queries, :] = language_model_inputs
1599
+ if attention_mask is None:
1600
+ attention_mask = torch.ones_like(
1601
+ input_ids, dtype=torch.long, device=language_model_inputs.device)
1602
+
1603
+ outputs = self.language_model(
1604
+ inputs_embeds=inputs_embeds,
1605
+ attention_mask=attention_mask,
1606
+ output_attentions=output_attentions,
1607
+ output_hidden_states=output_hidden_states,
1608
+ return_dict=return_dict,
1609
+ )
1610
+ logits = outputs.logits if return_dict else outputs[0]
1611
+ loss = None
1612
+ # we compute the loss here since we need to take into account the sequence length of the query embeds
1613
+ if labels is not None:
1614
+ labels = labels.to(logits.device)
1615
+ logits = logits[:, -labels.size(1):, :]
1616
+ # Shift so that tokens < n predict n
1617
+ shift_logits = logits[..., :-1, :].contiguous()
1618
+ shift_labels = labels[..., 1:].contiguous().to(logits.device)
1619
+
1620
+ # Flatten the tokens
1621
+ loss_fct = CrossEntropyLoss(reduction="mean")
1622
+
1623
+ loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1))
1624
+
1625
+ if not return_dict:
1626
+ output = (logits, vision_outputs, query_outputs, outputs)
1627
+ return ((loss,) + output) if loss is not None else output
1628
+
1629
+ return HuskyForConditionalGenerationModelOutput(
1630
+ loss=loss,
1631
+ logits=logits,
1632
+ vision_outputs=vision_outputs,
1633
+ qformer_outputs=query_outputs,
1634
+ language_model_outputs=outputs,
1635
+ )
1636
+
1637
@add_start_docstrings(
    """
    Husky Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    """,
    Husky_START_DOCSTRING,
)
class HuskyForConditionalGeneration(HuskyPreTrainedModel):
    config_class = HuskyConfig
    main_input_name = "pixel_values"

    def __init__(self, config: HuskyConfig):
        super().__init__(config)

        self.vision_model = HuskyVisionModel(config.vision_config)
        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = HuskyQFormerModel(config.qformer_config)

        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)

        self.config.hidden_size = config.text_config.hidden_size
        self.num_queries = config.num_query_tokens
        # Number of prompt tokens preceding the <image> placeholder ("Human: <img>").
        self.offset = 5

        # Adapter over [CLS] states pooled from 4 evenly spaced vision layers;
        # these pooled tokens are prepended to the Q-Former query outputs.
        self.vision_adapter = AdapterMLP(config)
        self.layer_norms = nn.ModuleList()
        for _ in range(4):
            self.layer_norms.append(
                nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
            )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self):
        return self.language_model.get_encoder()

    def get_decoder(self):
        return self.language_model.get_decoder()

    def _pool_vision_features(self, vision_outputs):
        """Pool the [CLS] state of 4 evenly spaced vision layers, layer-norm each,
        and run them through the adapter. Returns (batch, 4, hidden) tokens.

        `vision_outputs[2]` is the tuple of per-layer hidden states (requires the
        vision model to be called with `output_hidden_states=True`).
        """
        hidden_states = vision_outputs[2]
        depth = len(hidden_states)
        indices = range(depth // 4 - 1, depth, depth // 4)
        pooled = []
        for idx, layer_norm in zip(indices, self.layer_norms):
            cls_state = hidden_states[idx][:, 0, :].unsqueeze(1)
            pooled.append(layer_norm(cls_state))
        pooled = torch.cat(pooled, dim=1)
        return self.vision_adapter(pooled)

    def extract_feature(
        self,
        pixel_values: torch.FloatTensor,
    ):
        """Encode images into LM-space soft-prompt embeddings
        (pooled vision tokens followed by projected Q-Former queries)."""
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_hidden_states=True,
        )
        image_embeds = vision_outputs[0]
        pooled_outputs = self._pool_vision_features(vision_outputs)

        # Forward the query tokens through the QFormer, cross-attending over the image.
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_outputs = self.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask
        )
        query_output = query_outputs[0]
        # Soft prompting: pooled tokens + query tokens.
        query_output = torch.cat([pooled_outputs, query_output], dim=1)
        language_model_inputs = self.language_projection(query_output)

        return language_model_inputs

    def _tie_weights(self):
        # Encoder-decoder LMs share one embedding matrix across encoder and decoder.
        if not self.config.use_decoder_only_language_model:
            self.language_model.encoder.embed_tokens = self.language_model.shared
            self.language_model.decoder.embed_tokens = self.language_model.shared

    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        hf_device_map = self.hf_device_map

        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # warn users about unexpected behavior when using multi-GPU + Husky + `accelerate`.
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models.",
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility

    @add_start_docstrings_to_model_forward(Husky_INPUTS_DOCSTRING)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        pixel_embeds: Optional[torch.FloatTensor] = None,
    ) -> Union[Tuple, HuskyForConditionalGenerationModelOutput]:
        """Encode the image (plus pooled multi-layer features), splice the
        resulting soft prompt into the text embeddings after `self.offset`
        tokens, run the language model and optionally compute the LM loss."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Step 1: vision encoder -> image embeddings (batch_size, seq_len, hidden_size).
        batch_size = input_ids.shape[0]
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=return_dict,
            pixel_embeds=pixel_embeds,
        )
        image_embeds = vision_outputs[0]
        pooled_outputs = self._pool_vision_features(vision_outputs)

        # Step 2: forward the query tokens through the QFormer, cross-attending over the image.
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_outputs = self.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0]
        query_output = torch.cat([pooled_outputs, query_output], dim=1)  # num_queries + 4 pooled tokens

        # Step 3: condition the language model on query outputs and the prompt.
        language_model_inputs = self.language_projection(query_output)
        num_vision_tokens = language_model_inputs.shape[1]
        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)

        # Prompt layout: "Human: <img></img>. Give the describe Assistant:";
        # the vision tokens are inserted at position `self.offset`.
        prefix_embeds = inputs_embeds[:, :self.offset, :]
        postfix_embeds = inputs_embeds[:, self.offset:, :]
        inputs_embeds = torch.cat([prefix_embeds, language_model_inputs, postfix_embeds], dim=1)
        if attention_mask is None:
            # Bug fix: build a 2-D (batch, seq) mask. The previous
            # `torch.ones_like(inputs_embeds)` produced a 3-D tensor, which is
            # not a valid attention mask.
            attention_mask = torch.ones(
                inputs_embeds.shape[:2], dtype=torch.long, device=language_model_inputs.device)
        else:
            prefix_mask = attention_mask[:, :self.offset]
            postfix_mask = attention_mask[:, self.offset:]
            # Use the actual soft-prompt length instead of the magic `num_queries + 4`.
            vision_mask = torch.ones(size=(batch_size, num_vision_tokens), dtype=torch.long,
                                     device=attention_mask.device)
            attention_mask = torch.cat([prefix_mask, vision_mask, postfix_mask], dim=-1)

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = outputs.logits if return_dict else outputs[0]
        loss = None
        # We compute the loss here since we need to take into account the
        # sequence length of the query embeds.
        if labels is not None:
            labels = labels.to(logits.device)
            logits = logits[:, -labels.size(1):, :]
            # Shift so that tokens < n predict n.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous().to(logits.device)

            loss_fct = CrossEntropyLoss(reduction="mean")
            loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1))

        if not return_dict:
            output = (logits, vision_outputs, query_outputs, outputs)
            return ((loss,) + output) if loss is not None else output

        return HuskyForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        language_model_inputs: Optional[torch.FloatTensor] = None,
        generation_config: Optional[GenerationConfig] = None,
        **generate_kwargs,
    ) -> torch.LongTensor:
        """
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)):
                Input images to be processed.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices
            language_model_inputs (`torch.FloatTensor` of shape (batch_size, sequence_length, num_channel), *optional*):
                Precomputed soft-prompt embeddings (see `extract_feature`); when
                provided, `pixel_values` is ignored.
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        """

        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        if language_model_inputs is None:
            vision_outputs = self.vision_model(
                pixel_values=pixel_values,
                output_hidden_states=True,
            )
            image_embeds = vision_outputs[0]
            pooled_outputs = self._pool_vision_features(vision_outputs)

            image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
            query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
            query_outputs = self.qformer(
                query_embeds=query_tokens,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_attention_mask,
            )
            query_output = query_outputs[0]
            query_output = torch.cat([pooled_outputs, query_output], dim=1)

            language_model_inputs = self.language_projection(query_output)

        batch_size = language_model_inputs.shape[0]
        num_vision_tokens = language_model_inputs.shape[1]

        # Bug fix: default input_ids to a BOS-only prompt *before* embedding.
        # Previously `get_input_embeddings()(input_ids)` ran first and crashed
        # on a None prompt.
        if input_ids is None:
            input_ids = (
                torch.LongTensor([[self.config.text_config.bos_token_id]])
                .repeat(batch_size, 1)
                .to(language_model_inputs.device)
            )

        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
        prefix_embeds = inputs_embeds[:, :self.offset, :]
        postfix_embeds = inputs_embeds[:, self.offset:, :]
        inputs_embeds = torch.cat([prefix_embeds, language_model_inputs, postfix_embeds], dim=1)

        if attention_mask is None:
            # Bug fix: the mask must cover the spliced-in vision tokens, so it is
            # sized from the expanded embeddings, not from input_ids alone.
            attention_mask = torch.ones(
                inputs_embeds.shape[:2], dtype=torch.long, device=language_model_inputs.device)
        else:
            prefix_mask = attention_mask[:, :self.offset]
            postfix_mask = attention_mask[:, self.offset:]
            vision_mask = torch.ones(size=(batch_size, num_vision_tokens), dtype=torch.long,
                                     device=attention_mask.device)
            attention_mask = torch.cat([prefix_mask, vision_mask, postfix_mask], dim=-1)

        outputs = self.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            generation_config=generation_config,
            **generate_kwargs,
        )

        return outputs
robohusky/model/modeling_husky_embody2_ori.py ADDED
@@ -0,0 +1,1821 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The Salesforce Authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch Husky model."""
16
+
17
+ import contextlib
18
+ import math
19
+ from dataclasses import dataclass
20
+ from typing import Any, Optional, Tuple, Union
21
+
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import CrossEntropyLoss
26
+
27
+ from transformers.activations import ACT2FN
28
+ from transformers.modeling_outputs import (
29
+ BaseModelOutput,
30
+ BaseModelOutputWithPastAndCrossAttentions,
31
+ BaseModelOutputWithPooling,
32
+ BaseModelOutputWithPoolingAndCrossAttentions,
33
+ )
34
+ from transformers.modeling_utils import PreTrainedModel
35
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
36
+ from transformers.utils import (
37
+ ModelOutput,
38
+ add_start_docstrings,
39
+ add_start_docstrings_to_model_forward,
40
+ logging,
41
+ replace_return_docstrings,
42
+ )
43
+ from transformers import AutoModelForCausalLM, GenerationConfig
44
+
45
+ from .configuration_husky import HuskyConfig, HuskyQFormerConfig, HuskyVisionConfig
46
+
47
+ logger = logging.get_logger(__name__)
48
+
49
+ _CHECKPOINT_FOR_DOC = "wofmanaf/husky-7b"
50
+
51
+ HUSKY_PRETRAINED_MODEL_ARCHIVE_LIST = [
52
+ "wofmanaf/husky-7b",
53
+ ]
54
+
55
@dataclass
class HuskyForConditionalGenerationModelOutput(ModelOutput):
    """
    Class defining the outputs of [`HuskyForConditionalGeneration`].

    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Language modeling loss from the language model.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head of the language model.
        vision_outputs (`BaseModelOutputWithPooling`):
            Outputs of the vision encoder.
        qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
            Outputs of the Q-Former (Querying Transformer).
        language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
            Outputs of the language model.
    """

    loss: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    vision_outputs: Optional[torch.FloatTensor] = None
    qformer_outputs: Optional[Tuple[torch.FloatTensor]] = None
    language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None

    def to_tuple(self) -> Tuple[Any]:
        # Nested sub-model outputs are flattened via their own to_tuple();
        # plain fields are passed through unchanged.
        nested_keys = ("vision_outputs", "qformer_outputs", "language_model_outputs")
        items = []
        for key in self.keys():
            if key in nested_keys:
                items.append(getattr(self, key).to_tuple())
            else:
                items.append(self[key])
        return tuple(items)
86
+
87
+ # Copied from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->Husky
88
class HuskyVisionEmbeddings(nn.Module):
    """Patchify an image with a strided Conv2d, prepend a learned [CLS] token,
    and add learned position embeddings (BLIP-style vision embeddings)."""

    def __init__(self, config: HuskyVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(
            torch.randn(1, 1, self.embed_dim),
        )

        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """Return embeddings of shape (batch, num_patches + 1, embed_dim)."""
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        # Bug fix: cast the input to the conv weight dtype (as upstream BLIP does);
        # otherwise half/bfloat16/double checkpoints fail with a dtype mismatch.
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)  # [*, grid*grid, width]

        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        # Slice positions so shorter sequences (fewer patches) still broadcast.
        embeddings = embeddings + self.position_embedding[:, : embeddings.size(1), :].to(target_dtype)
        return embeddings
119
+
120
class HuskyVideoEmbeddings(nn.Module):
    """Patchify a video clip with a strided Conv3d (temporal stride
    `frame_stride`), prepend a learned [CLS] token, and add learned
    position embeddings."""

    def __init__(self, config: HuskyVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        # Defaults match the training setup when the config omits these fields.
        self.num_frames = getattr(self.config, "num_frames", 8)
        self.frame_stride = getattr(self.config, "frame_stride", 2)

        self.class_embedding = nn.Parameter(
            torch.randn(1, 1, self.embed_dim),
        )

        self.patch_embedding = nn.Conv3d(
            in_channels=3, out_channels=self.embed_dim,
            kernel_size=(self.frame_stride, self.patch_size, self.patch_size),
            stride=(self.frame_stride, self.patch_size, self.patch_size)
        )

        # Temporal patches x spatial patches.
        self.num_patches = int(self.num_frames // self.frame_stride) * (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """Return embeddings of shape (batch, num_patches + 1, embed_dim) from
        input of shape (batch, 3, num_frames, height, width)."""
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        # Bug fix: cast the input to the conv weight dtype (as upstream BLIP does);
        # otherwise half/bfloat16/double checkpoints fail with a dtype mismatch.
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)  # [*, t*h*w patches, width]

        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        # Slice positions so shorter sequences (fewer patches) still broadcast.
        embeddings = embeddings + self.position_embedding[:, : embeddings.size(1), :].to(target_dtype)
        return embeddings
155
+
156
+ class HuskyAttention(nn.Module):
157
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
158
+
159
+ def __init__(self, config):
160
+ super().__init__()
161
+ self.config = config
162
+ self.embed_dim = config.hidden_size
163
+ self.num_heads = config.num_attention_heads
164
+ self.head_dim = self.embed_dim // self.num_heads
165
+ if self.head_dim * self.num_heads != self.embed_dim:
166
+ raise ValueError(
167
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
168
+ f" {self.num_heads})."
169
+ )
170
+ self.scale = self.head_dim ** -0.5
171
+ self.dropout = nn.Dropout(config.attention_dropout)
172
+
173
+ # small tweak here compared to CLIP, no bias here
174
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)
175
+
176
+ if config.qkv_bias:
177
+ q_bias = nn.Parameter(torch.zeros(self.embed_dim))
178
+ v_bias = nn.Parameter(torch.zeros(self.embed_dim))
179
+ else:
180
+ q_bias = None
181
+ v_bias = None
182
+
183
+ if q_bias is not None:
184
+ qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
185
+ self.qkv.bias = nn.Parameter(qkv_bias)
186
+
187
+ self.projection = nn.Linear(self.embed_dim, self.embed_dim)
188
+
189
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
190
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
191
+
192
+ def forward(
193
+ self,
194
+ hidden_states: torch.Tensor,
195
+ head_mask: Optional[torch.Tensor] = None,
196
+ output_attentions: Optional[bool] = False,
197
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
198
+ """Input shape: Batch x Time x Channel"""
199
+
200
+ bsz, tgt_len, embed_dim = hidden_states.size()
201
+
202
+ mixed_qkv = self.qkv(hidden_states)
203
+
204
+ mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
205
+ 2, 0, 3, 1, 4
206
+ )
207
+ query_states, key_states, value_states = (
208
+ mixed_qkv[0],
209
+ mixed_qkv[1],
210
+ mixed_qkv[2],
211
+ )
212
+
213
+ # Take the dot product between "query" and "key" to get the raw attention scores.
214
+ attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
215
+
216
+ attention_scores = attention_scores * self.scale
217
+
218
+ # Normalize the attention scores to probabilities.
219
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
220
+
221
+ # This is actually dropping out entire tokens to attend to, which might
222
+ # seem a bit unusual, but is taken from the original Transformer paper.
223
+ attention_probs = self.dropout(attention_probs)
224
+
225
+ # Mask heads if we want to
226
+ if head_mask is not None:
227
+ attention_probs = attention_probs * head_mask
228
+
229
+ context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)
230
+
231
+ new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
232
+ context_layer = context_layer.reshape(new_context_layer_shape)
233
+
234
+ output = self.projection(context_layer)
235
+
236
+ outputs = (output, attention_probs) if output_attentions else (output, None)
237
+
238
+ return outputs
239
+
240
+ # Copied from transformers.models.blip.modeling_blip.BlipMLP
241
class HuskyMLP(nn.Module):
    """Two-layer feed-forward block: expand to `intermediate_size`, apply the
    configured activation, and project back to `hidden_size`."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Activation chosen by name from the transformers registry.
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # expand -> nonlinearity -> project back to the model width
        return self.fc2(self.activation_fn(self.fc1(hidden_states)))
254
+
255
# Copied from transformers.models.blip.modeling_blip.BlipEncoderLayer with Blip->Husky
class HuskyEncoderLayer(nn.Module):
    """One pre-norm transformer encoder layer: self-attention then an MLP,
    each sub-block wrapped in a residual connection."""

    def __init__(self, config: HuskyConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = HuskyAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = HuskyMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        # --- self-attention sub-block (pre-norm + residual) ---
        shortcut = hidden_states
        normed = self.layer_norm1(hidden_states)
        # NOTE: as in the upstream BLIP layer, the mask is forwarded to the
        # attention module through its `head_mask` argument.
        attn_out, attn_weights = self.self_attn(
            hidden_states=normed,
            head_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = attn_out + shortcut

        # --- feed-forward sub-block (pre-norm + residual) ---
        shortcut = hidden_states
        hidden_states = self.mlp(self.layer_norm2(hidden_states)) + shortcut

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
302
+
303
class HuskyPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = HuskyConfig
    base_model_prefix = "husky"
    supports_gradient_checkpointing = True
    # Checkpoint keys that may legitimately be absent (e.g. tied embeddings).
    _keys_to_ignore_on_load_missing = [
        r"position_ids",
        r"language_model.encoder.embed_tokens.weight",
        r"language_model.decoder.embed_tokens.weight",
        r"language_model.lm_head.weight",
    ]
    # Modules that must not be split across devices during sharded placement.
    _no_split_modules = ["HuskyAttention", "LlamaDecoderLayer", "LlamaForCausalLM"]
    _skip_keys_device_placement = "past_key_values"
    _keep_in_fp32_modules = ["wo"]

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_range
        # Conv / embedding / linear weights: truncated-free normal init, zero bias.
        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=factor)
            if hasattr(module, "bias") and module.bias is not None:
                module.bias.data.zero_()

        if isinstance(module, HuskyVisionEmbeddings):
            # The vision tower may define its own initializer range; prefer it.
            if hasattr(self.config, "vision_config"):
                factor = self.config.vision_config.initializer_range
            nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
            nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)

        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            # NOTE(review): this branch appears unreachable — nn.Linear is already
            # handled (weight + bias) by the first `if` above. Kept as-is for
            # parity with the upstream implementation.
            module.bias.data.zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        # Only the vision-side HuskyEncoder supports gradient checkpointing here.
        if isinstance(module, HuskyEncoder):
            module.gradient_checkpointing = value
345
+
346
+ Husky_START_DOCSTRING = r"""
347
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
348
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
349
+ etc.)
350
+
351
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
352
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
353
+ and behavior.
354
+
355
+ Parameters:
356
+ config ([`HuskyConfig`]): Model configuration class with all the parameters of the model.
357
+ Initializing with a config file does not load the weights associated with the model, only the
358
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
359
+ """
360
+
361
+ Husky_VISION_INPUTS_DOCSTRING = r"""
362
+ Args:
363
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
364
+ Pixel values. Pixel values can be obtained using [`HuskyProcessor`]. See [`HuskyProcessor.__call__`] for
365
+ details.
366
+ output_attentions (`bool`, *optional*):
367
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
368
+ tensors for more detail.
369
+ output_hidden_states (`bool`, *optional*):
370
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
371
+ more detail.
372
+ return_dict (`bool`, *optional*):
373
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
374
+ """
375
+
376
+ Husky_TEXT_INPUTS_DOCSTRING = r"""
377
+ Args:
378
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
379
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
380
+ it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
381
+ [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
382
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
383
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
384
+ - 1 for tokens that are **not masked**,
385
+ - 0 for tokens that are **masked**.
386
+ [What are attention masks?](../glossary#attention-mask)
387
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
388
+ Indices of decoder input sequence tokens in the vocabulary.
389
+
390
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
391
+ [`PreTrainedTokenizer.__call__`] for details.
392
+
393
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
394
+
395
+ T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
396
+ is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
397
+
398
+ To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
399
+ Training](./t5#training).
400
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
401
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
402
+ be used by default.
403
+ output_attentions (`bool`, *optional*):
404
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
405
+ tensors for more detail.
406
+ output_hidden_states (`bool`, *optional*):
407
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
408
+ more detail.
409
+ return_dict (`bool`, *optional*):
410
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
411
+ """
412
+
413
+ Husky_INPUTS_DOCSTRING = r"""
414
+ Args:
415
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
416
+ Pixel values. Pixel values can be obtained using [`HuskyProcessor`]. See [`HuskyProcessor.__call__`] for
417
+ details.
418
+
419
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
420
+ Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
421
+ provided to serve as text prompt, which the language model can continue.
422
+
423
+ Indices can be obtained using [`HuskyProcessor`]. See [`HuskyProcessor.__call__`] for details.
424
+
425
+ [What are input IDs?](../glossary#input-ids)
426
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
427
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
428
+
429
+ - 1 for tokens that are **not masked**,
430
+ - 0 for tokens that are **masked**.
431
+
432
+ [What are attention masks?](../glossary#attention-mask)
433
+
434
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
435
+ Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an
436
+ encoder-decoder language model (like T5) is used.
437
+
438
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
439
+ [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids)
440
+
441
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
442
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
443
+ be used by default.
444
+
445
+ Only relevant in case an encoder-decoder language model (like T5) is used.
446
+
447
+ output_attentions (`bool`, *optional*):
448
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
449
+ tensors for more detail.
450
+ output_hidden_states (`bool`, *optional*):
451
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
452
+ more detail.
453
+ return_dict (`bool`, *optional*):
454
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
455
+ """
456
+
457
# Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->Husky
class HuskyEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`HuskyEncoderLayer`].

    Args:
        config (`HuskyConfig`):
            The corresponding vision configuration for the `HuskyEncoder`.
    """

    def __init__(self, config: HuskyConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([HuskyEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        # Fall back to config-level defaults when the flags are not given explicitly.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Accumulators are tuples (or None when the caller didn't request them).
        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            # Record the hidden state *entering* each layer.
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                # Recompute activations on backward to save memory; the closure
                # re-binds `output_attentions` since checkpoint only forwards tensors.
                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(encoder_layer),
                    hidden_states,
                    attention_mask,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # Append the final layer output so hidden_states has num_layers + 1 entries.
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
548
+
549
# Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->Husky, BLIP->Husky
class HuskyVisionModel(HuskyPreTrainedModel):
    """Vision tower: patch/frame embeddings, a transformer encoder, and a final
    LayerNorm; pools by taking the first ([CLS]) token."""

    main_input_name = "pixel_values"
    config_class = HuskyVisionConfig

    def __init__(self, config: HuskyVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        # Separate embedding modules for single images (4-D input) and videos (5-D input).
        self.embeddings = HuskyVisionEmbeddings(config)
        self.video_embeddings = HuskyVideoEmbeddings(config)

        self.encoder = HuskyEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @add_start_docstrings_to_model_forward(Husky_VISION_INPUTS_DOCSTRING)
    # @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=HuskyVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_embeds: Optional[torch.FloatTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """Encode pixels (or precomputed embeddings) and return per-token and
        pooled features.

        Raises:
            ValueError: if neither `pixel_values` nor `pixel_embeds` is given,
                or if `pixel_values` is not 4-D (image) or 5-D (video).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None and pixel_embeds is None:
            raise ValueError("You have to specify pixel_values or pixel_embeds")

        # Precomputed embeddings bypass the embedding modules entirely.
        if pixel_embeds is not None:
            hidden_states = pixel_embeds
        else:
            # Dispatch on tensor rank: 4-D -> image path, 5-D -> video path.
            if len(pixel_values.shape) == 4:
                hidden_states = self.embeddings(pixel_values)
            elif len(pixel_values.shape) == 5:
                hidden_states = self.video_embeddings(pixel_values)
            else:
                raise ValueError(f"wrong pixel_values size: {pixel_values.shape}")

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        # Pool with the first token; NOTE(review): post_layernorm is applied twice
        # on the pooled token (once above, once here), matching the upstream BLIP code.
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        # Image (4-D) embedding module.
        return self.embeddings

    def get_video_embeddings(self):
        # Video (5-D) embedding module.
        return self.video_embeddings
624
+
625
class HuskyQFormerMultiHeadAttention(nn.Module):
    """BERT-style multi-head attention for the Q-Former; supports self-attention,
    cross-attention over encoder states, cached key/values, and (optionally)
    relative position embeddings."""

    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            # Keys/values come from the (vision) encoder, which may have a
            # different hidden size than the Q-Former itself.
            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
        # When True, cross-attention maps/gradients are stashed for inspection.
        self.save_attention = False

    def save_attn_gradients(self, attn_gradients):
        # Hook target: stores gradients of the attention probabilities.
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def transpose_for_scores(self, x):
        # (batch, seq, all_head) -> (batch, heads, seq, head_size)
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            # Self-attention with a cache: append the new keys/values to it.
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        mixed_query_layer = self.query(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Always return the (possibly extended) cache alongside the output.
        past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            # Add learned relative-position scores (BERT-style relative attention).
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        # Scale by sqrt(d_k) before the softmax.
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        if is_cross_attention and self.save_attention:
            # Keep the map and hook its gradient for attribution/visualization.
            self.save_attention_map(attention_probs)
            attention_probs.register_hook(self.save_attn_gradients)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs_dropped = attention_probs_dropped * head_mask

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        # (batch, heads, seq, head_size) -> (batch, seq, all_head_size)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        outputs = outputs + (past_key_value,)
        return outputs
755
+
756
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->HuskyQFormer
class HuskyQFormerSelfOutput(nn.Module):
    """Post-attention output block: dense projection, dropout, residual add,
    then LayerNorm (post-norm, as in the original BERT)."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states` and fold it back into `input_tensor`."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
769
+
770
class HuskyQFormerAttention(nn.Module):
    """Wraps multi-head (self- or cross-) attention together with its output
    projection block, and supports head pruning."""

    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.attention = HuskyQFormerMultiHeadAttention(config, is_cross_attention)
        self.output = HuskyQFormerSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads in place; a no-op for an empty list."""
        if not heads:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Shrink the q/k/v projections and the output projection accordingly.
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Book-keeping: fewer heads, smaller combined size, remember what was pruned.
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Run attention, apply the output block, and pass through any extra
        outputs (attention probs and/or cached key/values)."""
        attn_results = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        projected = self.output(attn_results[0], hidden_states)
        return (projected,) + attn_results[1:]
817
+
818
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->HuskyQFormer
class HuskyQFormerIntermediate(nn.Module):
    """Feed-forward up-projection (hidden_size -> intermediate_size) followed
    by the configured activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # The activation may be given as a registry key (string) or a callable.
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Project up, then activate."""
        return self.intermediate_act_fn(self.dense(hidden_states))
832
+
833
# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->HuskyQFormer
class HuskyQFormerOutput(nn.Module):
    """Feed-forward down-projection (intermediate_size -> hidden_size) with
    dropout, residual add, and post-norm LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project down and fold the result back into `input_tensor`."""
        down = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(down + input_tensor)
846
+
847
class HuskyQFormerLayer(nn.Module):
    """One Q-Former layer: self-attention over [queries; text], optional
    cross-attention (queries only) every `cross_attention_frequency` layers,
    and separate feed-forward paths for query and text tokens."""

    def __init__(self, config, layer_idx):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = HuskyQFormerAttention(config)

        self.layer_idx = layer_idx

        # Cross-attention is only instantiated on every Nth layer.
        if layer_idx % config.cross_attention_frequency == 0:
            self.crossattention = HuskyQFormerAttention(config, is_cross_attention=True)
            self.has_cross_attention = True
        else:
            self.has_cross_attention = False

        # Feed-forward path for the learned query tokens.
        # NOTE(review): `feed_forward_chunk` below references `self.intermediate`
        # and `self.output`, which are never assigned in this class — that path
        # would raise AttributeError if reached (i.e. when text tokens are present
        # or `query_length == 0`). Confirm against how callers invoke this layer.
        self.intermediate_query = HuskyQFormerIntermediate(config)
        self.output_query = HuskyQFormerOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        query_length=0,
    ):
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]
        # Middle entries are the optional attention maps; last is the kv cache.
        outputs = self_attention_outputs[1:-1]

        present_key_value = self_attention_outputs[-1]

        if query_length > 0:
            # First `query_length` positions are the learned query tokens.
            query_attention_output = attention_output[:, :query_length, :]

            if self.has_cross_attention:
                if encoder_hidden_states is None:
                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
                # Only the query tokens attend to the encoder (vision) states.
                cross_attention_outputs = self.crossattention(
                    query_attention_output,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions=output_attentions,
                )
                query_attention_output = cross_attention_outputs[0]
                # add cross attentions if we output attention weights
                outputs = outputs + cross_attention_outputs[1:-1]

            # Query tokens go through the query-specific feed-forward path.
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk_query,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                query_attention_output,
            )

            # Any remaining (text) tokens use the plain feed-forward path.
            if attention_output.shape[1] > query_length:
                layer_output_text = apply_chunking_to_forward(
                    self.feed_forward_chunk,
                    self.chunk_size_feed_forward,
                    self.seq_len_dim,
                    attention_output[:, query_length:, :],
                )
                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
        else:
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )
        outputs = (layer_output,) + outputs

        outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        # NOTE(review): `self.intermediate` / `self.output` are not defined in
        # __init__ above — see the note there before relying on this path.
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

    def feed_forward_chunk_query(self, attention_output):
        # Feed-forward for the query tokens (modules defined in __init__).
        intermediate_output = self.intermediate_query(attention_output)
        layer_output = self.output_query(intermediate_output, attention_output)
        return layer_output
945
+
946
class HuskyQFormerEncoder(nn.Module):
    """Stack of ``HuskyQFormerLayer`` blocks forming the Q-Former encoder.

    Supports optional gradient checkpointing during training; checkpointing
    is incompatible with key/value caching, so ``use_cache`` is force-disabled
    (with a warning) when both are requested.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [HuskyQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        query_length=0,
    ):
        """Run every layer in order, optionally collecting per-layer outputs.

        Args:
            hidden_states: input embeddings (query tokens, possibly with text).
            attention_mask: pre-extended additive self-attention mask.
            head_mask: optional per-layer head mask.
            encoder_hidden_states / encoder_attention_mask: cross-attention
                inputs (e.g. image features) and their inverted mask.
            past_key_values: per-layer cached key/value tuples.
            use_cache: whether to return new key/value caches.
            query_length: number of leading query tokens in `hidden_states`,
                forwarded to each layer.

        Returns:
            A ``BaseModelOutputWithPastAndCrossAttentions`` when
            ``return_dict`` is true, otherwise a tuple of the non-None fields.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions else None

        next_decoder_cache = () if use_cache else None

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                # Record the hidden state *entering* this layer.
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if getattr(self.config, "gradient_checkpointing", False) and self.training:
                if use_cache:
                    # Checkpointing recomputes activations, so cached
                    # key/values cannot be preserved across layers.
                    # Fixed: `logger.warn` is a deprecated alias of `warning`.
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    # torch.utils.checkpoint only forwards positional tensor
                    # args; close over the remaining non-tensor arguments.
                    def custom_forward(*inputs):
                        return module(*inputs, past_key_value, output_attentions, query_length)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    query_length,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                # Each layer returns its present key/value tuple last.
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if layer_module.has_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            # Append the final hidden state so the tuple has n_layers + 1 entries.
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
1046
+
1047
class HuskyQFormerModel(HuskyPreTrainedModel):
    """
    Querying Transformer (Q-Former), used in Husky.

    Takes a set of learned query embeddings (no token embedding table of its
    own), normalizes them, and runs them through ``HuskyQFormerEncoder``,
    optionally cross-attending to encoder hidden states (e.g. image features).
    """

    def __init__(self, config: HuskyQFormerConfig):
        super().__init__(config)
        self.config = config

        # Input normalization applied to the externally supplied query embeds.
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.encoder = HuskyQFormerEncoder(config)

        self.post_init()

    def get_input_embeddings(self):
        # NOTE(review): `self.embeddings` is never created in __init__ (only
        # layernorm/dropout/encoder are), so calling this raises
        # AttributeError — confirm whether an embeddings module was intended.
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        # NOTE(review): see get_input_embeddings — `self.embeddings` does not
        # exist on this class as written.
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: Tuple[int],
        device: torch.device,
        has_query: bool = False,
    ) -> torch.Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.
            device (`torch.device`):
                The device of the input to the model.
            has_query (`bool`, *optional*):
                Unused here; kept for interface compatibility.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        """
        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                    input_shape, attention_mask.shape
                )
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    def forward(
        self,
        query_embeds,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of:
            shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
            value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
            used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
            value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape
            `(batch_size, sequence_length)`.
        use_cache (`bool`, `optional`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # past_key_values_length: the cached length excludes the query tokens,
        # which are re-fed on every step.
        past_key_values_length = (
            past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0
        )

        query_length = query_embeds.shape[1] if query_embeds is not None else 0

        # Normalize + dropout the learned query embeddings (no token lookup).
        embedding_output = self.layernorm(query_embeds)
        embedding_output = self.dropout(embedding_output)

        input_shape = embedding_output.size()[:-1]
        batch_size, seq_length = input_shape
        device = embedding_output.device

        if attention_mask is None:
            # Default: attend to all current and cached positions.
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            # A list of encoder states means a per-layer list of masks as well.
            if type(encoder_hidden_states) == list:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                (
                    encoder_batch_size,
                    encoder_sequence_length,
                    _,
                ) = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if type(encoder_attention_mask) == list:
                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            query_length=query_length,
        )
        sequence_output = encoder_outputs[0]
        # "Pooled" output is simply the first query token's hidden state.
        pooled_output = sequence_output[:, 0, :]

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
1238
+
1239
class AdapterMLP(nn.Module):
    """Bottleneck MLP that adapts vision features to the Q-Former width.

    Maps from the vision hidden size through a quarter-width SiLU bottleneck
    up to the Q-Former hidden size, finishing with a LayerNorm.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN["silu"]

        in_dim = config.vision_config.hidden_size
        bottleneck_dim = in_dim // 4
        out_dim = config.qformer_config.hidden_size

        self.fc1 = nn.Linear(in_dim, bottleneck_dim)
        self.fc2 = nn.Linear(bottleneck_dim, out_dim)
        self.layernorm = nn.LayerNorm(out_dim, eps=config.vision_config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Project `(..., vision_hidden)` features to `(..., qformer_hidden)`."""
        squeezed = self.activation_fn(self.fc1(hidden_states))
        return self.layernorm(self.fc2(squeezed))
1258
+
1259
@add_start_docstrings(
    """
    Husky Model for generating text and image features. The model consists of a vision encoder, Querying Transformer
    (Q-Former) and a language model.
    """,
    Husky_START_DOCSTRING,
)
class HuskyModel(HuskyPreTrainedModel):
    config_class = HuskyConfig
    main_input_name = "pixel_values"

    def __init__(self, config: HuskyConfig):
        super().__init__(config)

        self.vision_model = HuskyVisionModel(config.vision_config)

        # Learned query tokens consumed by the Q-Former (expanded per batch).
        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = HuskyQFormerModel(config.qformer_config)

        # Projects Q-Former outputs into the language model's embedding space.
        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)

        self.config.hidden_size = config.text_config.hidden_size
        self.num_queries = config.num_query_tokens
        # Number of prompt tokens preceding the <IMAGE> placeholder in the
        # fixed prompt template (see forward); the visual tokens are written
        # into positions [offset, offset + num_queries).
        self.offset = 5

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # Delegate to the wrapped language model's token embedding table.
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self):
        # Only meaningful for encoder-decoder language models.
        return self.language_model.get_encoder()

    def get_decoder(self):
        return self.language_model.get_decoder()

    def _tie_weights(self):
        # Encoder-decoder language models share one token embedding table.
        if not self.config.use_decoder_only_language_model:
            self.language_model.encoder.embed_tokens = self.language_model.shared
            self.language_model.decoder.embed_tokens = self.language_model.shared

    @add_start_docstrings_to_model_forward(Husky_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        Runs only the language model (no vision / Q-Former path).

        Returns:
            text_outputs (`CausalLMOutputWithPast`, or `tuple(torch.FloatTensor)` if `return_dict=False`):
                The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that
                contains the language model logits, the past key values and the hidden states if
                `output_hidden_states=True`.
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.language_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return text_outputs

    @add_start_docstrings_to_model_forward(Husky_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Runs only the vision encoder and returns its raw outputs.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return vision_outputs

    @add_start_docstrings_to_model_forward(Husky_INPUTS_DOCSTRING)
    def get_qformer_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        Runs the vision encoder followed by the Q-Former and returns the
        Q-Former outputs.

        Returns:
            vision_outputs (`BaseModelOutputWithPooling` or tuple of `torch.FloatTensor`):
                The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that
                contains the image features, the pooled image features and the hidden states if
                `output_hidden_states=True`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_outputs = self.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return query_outputs

    @add_start_docstrings_to_model_forward(Husky_INPUTS_DOCSTRING)
    # @replace_return_docstrings(output_type=HuskyForConditionalGenerationModelOutput, config_class=HuskyVisionConfig)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        input_ids: torch.FloatTensor,
        attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, HuskyForConditionalGenerationModelOutput]:
        """Vision encoder -> Q-Former -> language model, with optional LM loss."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: forward the images through the vision encoder,
        # to get image embeddings of shape (batch_size, seq_len, hidden_size)
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_outputs = self.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0]

        # step 3: use the language model, conditioned on the query outputs and the prompt
        language_model_inputs = self.language_projection(query_output)
        assert language_model_inputs.shape[1] == self.num_queries

        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
        # Human: <img><IMAGE></img>. Give the describe Assistant:
        # position of <image>: [offset: offset+num_queries]

        # NOTE(review): in-place overwrite of the embedding outputs; assumes
        # input_ids reserves exactly num_queries placeholder tokens starting
        # at position `offset` — confirm against the prompt builder.
        inputs_embeds[:, self.offset:self.offset + self.num_queries, :] = language_model_inputs
        if attention_mask is None:
            attention_mask = torch.ones_like(
                input_ids, dtype=torch.long, device=language_model_inputs.device)

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = outputs.logits if return_dict else outputs[0]
        loss = None
        # we compute the loss here since we need to take into account the sequence length of the query embeds
        if labels is not None:
            labels = labels.to(logits.device)
            # Align logits to the labels by keeping only the trailing
            # labels.size(1) positions.
            logits = logits[:, -labels.size(1):, :]
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous().to(logits.device)

            # Flatten the tokens
            loss_fct = CrossEntropyLoss(reduction="mean")

            loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1))

        if not return_dict:
            output = (logits, vision_outputs, query_outputs, outputs)
            return ((loss,) + output) if loss is not None else output

        return HuskyForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )
1495
+
1496
+ @add_start_docstrings(
1497
+ """
1498
+ Husky Model for generating text given an image and an optional text prompt. The model consists of a vision
1499
+ encoder, Querying Transformer (Q-Former) and a language model.
1500
+
1501
+ One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
1502
+ the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
1503
+ """,
1504
+ Husky_START_DOCSTRING,
1505
+ )
1506
+ class HuskyForConditionalGeneration(HuskyPreTrainedModel):
1507
+ config_class = HuskyConfig
1508
+ main_input_name = "pixel_values"
1509
+
1510
    def __init__(self, config: HuskyConfig):
        """Build the vision encoder, Q-Former, adapters and language model."""
        super().__init__(config)

        self.vision_model = HuskyVisionModel(config.vision_config)
        # Learned query tokens consumed by the Q-Former (expanded per batch).
        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = HuskyQFormerModel(config.qformer_config)

        # Projects Q-Former outputs into the language model's embedding space.
        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)

        self.config.hidden_size = config.text_config.hidden_size
        self.num_queries = config.num_query_tokens
        # Number of prompt tokens that precede the image slot in the fixed
        # prompt template (see forward).
        self.offset = 5

        # Adapter + per-level LayerNorms for the 4 pooled CLS tokens taken
        # from evenly spaced vision-encoder layers (see extract_feature).
        self.vision_adapter = AdapterMLP(config)
        self.layer_norms = nn.ModuleList()
        for i in range(4):
            self.layer_norms.append(
                nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
            )

        # Initialize weights and apply final processing
        self.post_init()
1533
+
1534
    def get_input_embeddings(self):
        # Delegate to the wrapped language model's token embedding table.
        return self.language_model.get_input_embeddings()
1536
+
1537
    def set_input_embeddings(self, value):
        # Delegate to the wrapped language model.
        self.language_model.set_input_embeddings(value)
1539
+
1540
    def set_output_embeddings(self, new_embeddings):
        # Delegate to the wrapped language model.
        self.language_model.set_output_embeddings(new_embeddings)
1542
+
1543
    def get_output_embeddings(self) -> nn.Module:
        # Delegate to the wrapped language model's LM head.
        return self.language_model.get_output_embeddings()
1545
+
1546
    def get_encoder(self):
        # Only meaningful when the language model is encoder-decoder.
        return self.language_model.get_encoder()
1548
+
1549
    def get_decoder(self):
        # Delegate to the wrapped language model.
        return self.language_model.get_decoder()
1551
+
1552
    def extract_feature(
        self,
        pixel_values: torch.FloatTensor,
    ):
        """Compute the visual soft-prompt embeddings for `pixel_values`.

        Runs the vision encoder, pools the CLS token from 4 evenly spaced
        encoder layers through per-level LayerNorms and the vision adapter,
        runs the Q-Former over the full image embeddings, concatenates the
        pooled tokens in front of the query outputs, and projects everything
        into the language model's embedding space.
        """
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_hidden_states=True,
        )
        image_embeds = vision_outputs[0]

        # vision_outputs[2] holds the per-layer hidden states; pick 4 layers
        # spaced depth//4 apart (ending at the last layer).
        depth = len(vision_outputs[2])
        indices = range(depth // 4 - 1, depth, depth // 4)
        pooled_outputs = []
        for idx, layer_norm in zip(indices, self.layer_norms):
            # CLS token of the selected layer, kept as a length-1 sequence.
            pool_output = vision_outputs[2][idx][:, 0, :].unsqueeze(1)
            pool_output = layer_norm(pool_output)
            pooled_outputs.append(pool_output)

        pooled_outputs = torch.cat(pooled_outputs, dim=1)
        pooled_outputs = self.vision_adapter(pooled_outputs)

        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_outputs = self.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask
        )
        query_output = query_outputs[0]
        # soft_prompting: pooled multi-level tokens prefix the query outputs.
        query_output = torch.cat([pooled_outputs, query_output], dim=1)
        language_model_inputs = self.language_projection(query_output)

        return language_model_inputs
1588
+
1589
    def _tie_weights(self):
        # Encoder-decoder language models share one token embedding table;
        # decoder-only models need no tying here.
        if not self.config.use_decoder_only_language_model:
            self.language_model.encoder.embed_tokens = self.language_model.shared
            self.language_model.decoder.embed_tokens = self.language_model.shared
1593
+
1594
    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        hf_device_map = self.hf_device_map

        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # warn users about unexpected behavior when using multi-GPU + Husky + `accelerate`.
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models.",
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility
1613
+
1614
+ @add_start_docstrings_to_model_forward(Husky_INPUTS_DOCSTRING)
1615
+ # @replace_return_docstrings(output_type=HuskyForConditionalGenerationModelOutput, config_class=HuskyVisionConfig)
1616
+ def forward(
1617
+ self,
1618
+ pixel_values: Optional[torch.FloatTensor] = None,
1619
+ input_ids: Optional[torch.FloatTensor] = None,
1620
+ attention_mask: Optional[torch.LongTensor] = None,
1621
+ output_attentions: Optional[bool] = None,
1622
+ output_hidden_states: Optional[bool] = None,
1623
+ labels: Optional[torch.LongTensor] = None,
1624
+ return_dict: Optional[bool] = None,
1625
+ pixel_embeds: Optional[torch.FloatTensor] = None,
1626
+ ) -> Union[Tuple, HuskyForConditionalGenerationModelOutput]:
1627
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1628
+
1629
+ # step 1: forward the images through the vision encoder,
1630
+ # to get image embeddings of shape (batch_size, seq_len, hidden_size)
1631
+ batch_size = input_ids.shape[0]
1632
+ vision_outputs = self.vision_model(
1633
+ pixel_values=pixel_values,
1634
+ output_attentions=output_attentions,
1635
+ output_hidden_states=True,
1636
+ return_dict=return_dict,
1637
+ pixel_embeds=pixel_embeds,
1638
+ )
1639
+ image_embeds = vision_outputs[0]
1640
+ depth = len(vision_outputs[2])
1641
+ indices = range(depth // 4 - 1, depth, depth // 4)
1642
+ pooled_outputs = []
1643
+ for idx, layer_norm in zip(indices, self.layer_norms):
1644
+ pool_output = vision_outputs[2][idx][:, 0, :].unsqueeze(1)
1645
+ pool_output = layer_norm(pool_output)
1646
+ pooled_outputs.append(pool_output)
1647
+
1648
+ pooled_outputs = torch.cat(pooled_outputs, dim=1)
1649
+ pooled_outputs = self.vision_adapter(pooled_outputs)
1650
+
1651
+ # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
1652
+ image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
1653
+
1654
+ query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
1655
+ query_outputs = self.qformer(
1656
+ query_embeds=query_tokens,
1657
+ encoder_hidden_states=image_embeds,
1658
+ encoder_attention_mask=image_attention_mask,
1659
+ output_attentions=output_attentions,
1660
+ output_hidden_states=output_hidden_states,
1661
+ return_dict=return_dict,
1662
+ )
1663
+ query_output = query_outputs[0]
1664
+ query_output = torch.cat([pooled_outputs, query_output], dim=1) # 36 token
1665
+
1666
+ # step 3: use the language model, conditioned on the query outputs and the prompt
1667
+ language_model_inputs = self.language_projection(query_output)
1668
+ inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
1669
+
1670
+ # Human: <img></img>. Give the describe Assistant:
1671
+ # position of <image>: [offset: offset+num_queries]
1672
+ prefix_embeds = inputs_embeds[:, :self.offset, :]
1673
+ postfix_embeds = inputs_embeds[:, self.offset:, :]
1674
+ inputs_embeds = torch.cat([prefix_embeds, language_model_inputs, postfix_embeds], dim=1)
1675
+ if attention_mask is None:
1676
+ attention_mask = torch.ones_like(
1677
+ inputs_embeds, dtype=torch.long, device=language_model_inputs.device)
1678
+ else:
1679
+ prefix_mask = attention_mask[:, :self.offset]
1680
+ postfix_mask = attention_mask[:, self.offset:]
1681
+ vision_mask = torch.ones(size=(batch_size, self.num_queries + 4), dtype=torch.long,
1682
+ device=attention_mask.device)
1683
+ attention_mask = torch.cat([prefix_mask, vision_mask, postfix_mask], dim=-1)
1684
+
1685
+ outputs = self.language_model(
1686
+ inputs_embeds=inputs_embeds,
1687
+ attention_mask=attention_mask,
1688
+ output_attentions=output_attentions,
1689
+ output_hidden_states=output_hidden_states,
1690
+ return_dict=return_dict,
1691
+ )
1692
+ logits = outputs.logits if return_dict else outputs[0]
1693
+ loss = None
1694
+ # we compute the loss here since we need to take into account the sequence length of the query embeds
1695
+ if labels is not None:
1696
+ labels = labels.to(logits.device)
1697
+ logits = logits[:, -labels.size(1):, :]
1698
+ # Shift so that tokens < n predict n
1699
+ shift_logits = logits[..., :-1, :].contiguous()
1700
+ shift_labels = labels[..., 1:].contiguous().to(logits.device)
1701
+
1702
+ # Flatten the tokens
1703
+ loss_fct = CrossEntropyLoss(reduction="mean")
1704
+
1705
+ loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1))
1706
+
1707
+ if not return_dict:
1708
+ output = (logits, vision_outputs, query_outputs, outputs)
1709
+ return ((loss,) + output) if loss is not None else output
1710
+
1711
+ return HuskyForConditionalGenerationModelOutput(
1712
+ loss=loss,
1713
+ logits=logits,
1714
+ vision_outputs=vision_outputs,
1715
+ qformer_outputs=query_outputs,
1716
+ language_model_outputs=outputs,
1717
+ )
1718
+
1719
+ @torch.no_grad()
1720
+ def generate(
1721
+ self,
1722
+ pixel_values: Optional[torch.FloatTensor] = None,
1723
+ input_ids: Optional[torch.LongTensor] = None,
1724
+ attention_mask: Optional[torch.LongTensor] = None,
1725
+ language_model_inputs: Optional[torch.FloatTensor] = None,
1726
+ generation_config: Optional[GenerationConfig] = None,
1727
+ **generate_kwargs,
1728
+ ) -> torch.LongTensor:
1729
+ """
1730
+ Overrides `generate` function to be able to use the model as a conditional generator.
1731
+
1732
+ Args:
1733
+ pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)):
1734
+ Input images to be processed.
1735
+ input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
1736
+ The sequence used as a prompt for the generation.
1737
+ attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
1738
+ Mask to avoid performing attention on padding token indices
1739
+ language_model_inputs (`torch.LongTensor` of shape (batch_size, sequence_length, num_channel), *optional*):
1740
+ The sequence used as the input for the generation
1741
+ language_model_inputs (`torch.LongTensor` of shape (batch_size, sequence_length, num_channel), *optional*):
1742
+ The sequence used as the input for the generation
1743
+ generation_config (`~generation.GenerationConfig`, *optional*):
1744
+ The generation configuration to be used as base parametrization for the generation call. `**kwargs`
1745
+ passed to generate matching the attributes of `generation_config` will override them. If
1746
+ `generation_config` is not provided, the default will be used, which had the following loading
1747
+ priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
1748
+ configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
1749
+ default values, whose documentation should be checked to parameterize generation.
1750
+
1751
+ Returns:
1752
+ captions (list): A list of strings of length batch_size * num_captions.
1753
+ """
1754
+
1755
+ if hasattr(self, "hf_device_map"):
1756
+ # preprocess for `accelerate`
1757
+ self._preprocess_accelerate()
1758
+
1759
+ if language_model_inputs is None:
1760
+ vision_outputs = self.vision_model(
1761
+ pixel_values=pixel_values,
1762
+ output_hidden_states=True,
1763
+ )
1764
+ image_embeds = vision_outputs[0]
1765
+
1766
+ depth = len(vision_outputs[2])
1767
+ indices = range(depth // 4 - 1, depth, depth // 4)
1768
+ pooled_outputs = []
1769
+ for idx, layer_norm in zip(indices, self.layer_norms):
1770
+ pool_output = vision_outputs[2][idx][:, 0, :].unsqueeze(1)
1771
+ pool_output = layer_norm(pool_output)
1772
+ pooled_outputs.append(pool_output)
1773
+
1774
+ pooled_outputs = torch.cat(pooled_outputs, dim=1)
1775
+ pooled_outputs = self.vision_adapter(pooled_outputs)
1776
+
1777
+ image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
1778
+
1779
+ query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
1780
+ query_outputs = self.qformer(
1781
+ query_embeds=query_tokens,
1782
+ encoder_hidden_states=image_embeds,
1783
+ encoder_attention_mask=image_attention_mask,
1784
+ )
1785
+ query_output = query_outputs[0]
1786
+ query_output = torch.cat([pooled_outputs, query_output], dim=1)
1787
+
1788
+ language_model_inputs = self.language_projection(query_output)
1789
+
1790
+ batch_size = language_model_inputs.shape[0]
1791
+ inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
1792
+
1793
+ prefix_embeds = inputs_embeds[:, :self.offset, :]
1794
+ postfix_embeds = inputs_embeds[:, self.offset:, :]
1795
+ inputs_embeds = torch.cat([prefix_embeds, language_model_inputs, postfix_embeds], dim=1)
1796
+
1797
+ if input_ids is None:
1798
+ input_ids = (
1799
+ torch.LongTensor([[self.config.text_config.bos_token_id]])
1800
+ .repeat(batch_size, 1)
1801
+ .to(inputs_embeds.device)
1802
+ )
1803
+
1804
+ if attention_mask is None:
1805
+ attention_mask = torch.ones_like(
1806
+ input_ids, dtype=torch.long, device=language_model_inputs.device)
1807
+ else:
1808
+ prefix_mask = attention_mask[:, :self.offset]
1809
+ postfix_mask = attention_mask[:, self.offset:]
1810
+ vision_mask = torch.ones(size=(batch_size, self.num_queries + 4), dtype=torch.long,
1811
+ device=attention_mask.device)
1812
+ attention_mask = torch.cat([prefix_mask, vision_mask, postfix_mask], dim=-1)
1813
+
1814
+ outputs = self.language_model.generate(
1815
+ inputs_embeds=inputs_embeds,
1816
+ attention_mask=attention_mask,
1817
+ generation_config=generation_config,
1818
+ **generate_kwargs,
1819
+ )
1820
+
1821
+ return outputs
robohusky/model/processing_husky.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for Husky. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
17
+ """
18
+
19
+ from typing import List, Optional, Union
20
+
21
+ from transformers.processing_utils import ProcessorMixin
22
+ from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, \
23
+ TruncationStrategy
24
+ from transformers.utils import TensorType
25
+ from transformers.models.auto import AutoTokenizer
26
+
27
+
28
class HuskyProcessor(ProcessorMixin):
    r"""
    Constructs a Husky processor which wraps a BLIP image processor and a LLaMa/T5 tokenizer into a single
    processor.

    [`HuskyProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the
    docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.

    Args:
        image_processor (`BlipImageProcessor`):
            An instance of [`BlipImageProcessor`]. The image processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
    """
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "BlipImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

        # The Q-Former has its own BERT tokenizer with a dedicated decoder BOS token.
        # NOTE(review): downloaded at construction time; consider injecting it instead.
        self.qformer_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", truncation_side="left")
        self.qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"})

    def __call__(
        self,
        images=None,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_token_type_ids: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchEncoding:
        """
        This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        """
        if images is None and text is None:
            raise ValueError("You have to specify either images or text.")

        # One shared kwargs bundle for every tokenizer call below; the original
        # repeated this ~16-keyword block three times verbatim.
        tokenize_kwargs = dict(
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_token_type_ids=return_token_type_ids,
            return_length=return_length,
            verbose=verbose,
            return_tensors=return_tensors,
            **kwargs,
        )

        # Text-only path: tokenize and return immediately.
        if images is None:
            self.current_processor = self.tokenizer
            return self.tokenizer(text=text, **tokenize_kwargs)

        # Image path: compute pixel_values, optionally merging text encodings.
        encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)

        if text is not None:
            text_encoding = self.tokenizer(text=text, **tokenize_kwargs)
            qformer_text_encoding = self.qformer_tokenizer(text=text, **tokenize_kwargs)
            # Namespace the Q-Former fields so they don't collide with the LLM fields.
            qformer_text_encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids")
            qformer_text_encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask")
            text_encoding.update(qformer_text_encoding)
            encoding_image_processor.update(text_encoding)

        return encoding_image_processor

    # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
robohusky/train/.DS_Store ADDED
Binary file (6.15 kB). View file
 
robohusky/train/llama_flash_attn_monkey_patch.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ from flash_attn import __version__ as flash_attn_version
6
+ from flash_attn.bert_padding import pad_input, unpad_input
7
+ from flash_attn.flash_attn_interface import (
8
+ flash_attn_func,
9
+ flash_attn_varlen_kvpacked_func,
10
+ )
11
+ from transformers.models.llama.modeling_llama import (
12
+ LlamaAttention,
13
+ LlamaModel,
14
+ rotate_half,
15
+ )
16
+
17
def apply_rotary_pos_emb(q, k, cos_sin, position_ids):
    """Rotate q and k by the cos/sin tables gathered at `position_ids`.

    `cos_sin` is the `(cos, sin)` pair produced by the rotary embedding module;
    the tables are gathered per batch position so non-contiguous position ids
    (e.g. with a kv cache) are handled correctly.
    """
    # Build a [bsz, seq_len, n, d] gather index from [bsz, seq_len] position ids.
    gather_indices = position_ids[:, :, None, None].repeat(
        1, 1, cos_sin[0].shape[1], cos_sin[0].shape[3]
    )
    bsz = gather_indices.shape[0]

    cos = torch.gather(cos_sin[0].transpose(1, 2).repeat(bsz, 1, 1, 1), 1, gather_indices)
    sin = torch.gather(cos_sin[1].transpose(1, 2).repeat(bsz, 1, 1, 1), 1, gather_indices)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
29
+
30
def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    padding_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Flash-attention replacement for `LlamaAttention.forward`.

    Expects `attention_mask` to be a boolean [bsz, seq_len] key-padding mask
    (or None for a fully dense batch) — see the patched
    `_prepare_decoder_attention_mask` below. Attention weights are never
    returned (always None in the second tuple slot).
    """
    if output_attentions:
        warnings.warn(
            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
        )

    bsz, q_len, _ = hidden_states.size()
    # Grouped-query attention support: fall back to num_heads when the model
    # has no separate kv head count.
    kv_heads = getattr(self, "num_key_value_heads", self.num_heads)

    q, k, v = (
        op(hidden_states).view(bsz, q_len, nh, self.head_dim)
        for op, nh in (
            (self.q_proj, self.num_heads),
            (self.k_proj, kv_heads),
            (self.v_proj, kv_heads),
        )
    )
    # shape: (b, s, num_heads, head_dim)

    kv_seq_len = k.shape[1]
    past_kv_len = 0
    if past_key_value is not None:
        # Cached tensors are stored as (b, num_heads, s, head_dim).
        past_kv_len = past_key_value[0].shape[2]
        kv_seq_len += past_kv_len

    # Rotary embeddings over the full (past + current) sequence length.
    cos_sin = self.rotary_emb(v, seq_len=kv_seq_len)
    q, k = apply_rotary_pos_emb(q, k, cos_sin, position_ids)

    if past_key_value is not None:
        assert (
            flash_attn_version >= "2.1.0"
        ), "past_key_value support requires flash-attn >= 2.1.0"
        # reuse k, v
        k = torch.cat([past_key_value[0].transpose(1, 2), k], dim=1)
        v = torch.cat([past_key_value[1].transpose(1, 2), v], dim=1)

    # Cache in the HF (b, num_heads, s, head_dim) layout expected by callers.
    past_key_value = (k.transpose(1, 2), v.transpose(1, 2)) if use_cache else None

    if attention_mask is None:
        # Dense path: no padding anywhere in the batch.
        output = flash_attn_func(q, k, v, 0.0, softmax_scale=None, causal=True).view(
            bsz, q_len, -1
        )
    else:
        # Varlen path: strip padding, run packed flash attention, re-pad.
        # The query mask only covers the current (non-cached) tokens.
        q, indices, cu_q_lens, max_s = unpad_input(q, attention_mask[:, -q_len:])
        # We can skip concat and call unpad twice but seems better to call unpad only once.
        kv, _, cu_k_lens, max_k = unpad_input(
            torch.stack((k, v), dim=2), attention_mask
        )
        output_unpad = flash_attn_varlen_kvpacked_func(
            q,
            kv,
            cu_q_lens,
            cu_k_lens,
            max_s,
            max_k,
            0.0,
            softmax_scale=None,
            causal=True,
        )
        output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
        output = pad_input(output_unpad, indices, bsz, q_len)

    return self.o_proj(output), None, past_key_value
102
+
103
+ # Disable the transformation of the attention mask in LlamaModel as flash attention
104
+ # takes a boolean key_padding_mask. Fills in the past kv length for use in forward.
105
+ def _prepare_decoder_attention_mask(
106
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
107
+ ):
108
+ # [bsz, seq_len]
109
+ if past_key_values_length > 0 and attention_mask is not None:
110
+ attention_mask = torch.cat(
111
+ (
112
+ torch.full(
113
+ (input_shape[0], past_key_values_length),
114
+ True,
115
+ dtype=attention_mask.dtype,
116
+ device=attention_mask.device,
117
+ ),
118
+ attention_mask,
119
+ ),
120
+ dim=-1,
121
+ )
122
+
123
+ if attention_mask is not None and torch.all(attention_mask):
124
+ return None # This uses the faster call when training with full samples
125
+
126
+ return attention_mask
127
+
128
def replace_llama_attn_with_flash_attn():
    """Monkey-patch `LlamaAttention.forward` and `LlamaModel`'s mask preparation
    with the flash-attention implementations defined in this module.

    Warns (but still patches) on pre-Ampere GPUs, where the flash-attention
    backward pass is unsupported for head dim > 64.
    """
    major, _minor = torch.cuda.get_device_capability()
    if major < 8:
        warnings.warn(
            "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
            "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
        )

    LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
    LlamaAttention.forward = forward
138
+
139
def test():
    """Numerically compare the patched flash-attention forward against the stock
    `LlamaAttention.forward`, with and without padding and with a kv cache.

    Requires a CUDA device and flash-attn; prints mean absolute differences and
    allclose results rather than asserting tolerances.
    """
    from robohusky.train.llama_flash_attn_monkey_patch import forward as fastchat_forward
    from transformers.models.llama.configuration_llama import LlamaConfig

    # Tiny single-layer config so the comparison runs quickly.
    config = LlamaConfig(
        hidden_size=1024,
        intermediate_size=128,
        num_hidden_layers=1,
        num_attention_heads=8,
        max_position_embeddings=16,
    )
    device = torch.device("cuda")
    model = LlamaModel(config)
    attn = LlamaAttention(config).to(device).half()
    bsz, hs, seqlen = 2, config.hidden_size, config.max_position_embeddings
    position_ids = torch.arange(seqlen, dtype=torch.long, device=device).view(
        -1, seqlen
    )

    mask = torch.full((bsz, seqlen), True, dtype=torch.bool, device=device)
    for i in range(4):
        hidden = torch.rand((bsz, seqlen, hs), dtype=torch.float16, device=device)
        # Progressively pad the two sequences from opposite ends.
        if i:
            mask[0, -i:] = False
            mask[1, :i] = False

        # Reference: stock attention with the (unpatched) expanded causal mask.
        lmask = model._prepare_decoder_attention_mask(mask, hidden.shape[:2], hidden, 0)
        ref, _, _ = attn.forward(
            hidden, attention_mask=lmask, position_ids=position_ids
        )

        # Patched forward fed the raw boolean padding mask directly.
        fast, _, _ = fastchat_forward(
            attn, hidden, attention_mask=mask, position_ids=position_ids
        )

        # Patched forward fed the mask via the patched preparation helper.
        lmask = _prepare_decoder_attention_mask(
            model, mask, hidden.shape[:2], hidden, 0
        )
        # NOTE: local name `test` shadows this function inside its own body.
        test, _, _ = forward(
            attn, hidden, attention_mask=lmask, position_ids=position_ids
        )

        print(f"Mean(abs(ref)) = {torch.mean(torch.abs(ref))}")
        print(f"Mean(abs(ref - fast)) = {torch.mean(torch.abs(ref - fast))}")
        print(f"Mean(abs(ref - test)) = {torch.mean(torch.abs(ref - test))}")
        print(f"Mean(abs(fast - test)) = {torch.mean(torch.abs(fast - test))}")
        print(f"allclose(fast, test) = {torch.allclose(fast, test)}")

    with torch.no_grad():
        # Also check that past_kv is handled properly
        hidden = torch.rand((bsz, seqlen, hs), dtype=torch.float16, device=device)
        part_len = seqlen // 4
        assert part_len * 4 == seqlen
        mask = torch.full((bsz, seqlen), True, dtype=torch.bool, device=device)
        mask[0, -2:] = False
        lmask = _prepare_decoder_attention_mask(
            model, mask, hidden.shape[:2], hidden, 0
        )
        # Full-sequence pass in one shot...
        oneshot, _, _ = forward(
            attn, hidden, attention_mask=lmask, position_ids=position_ids
        )
        parts = []
        past_kv, past_kv_len = None, 0
        # ...versus 4 incremental quarter-length passes through the kv cache.
        for i in range(4):
            start = part_len * i
            end = start + part_len
            hidden_part = hidden[:, start:end, ...]
            lmask = _prepare_decoder_attention_mask(
                model,
                mask[:, start:end],
                hidden_part.shape[:2],
                hidden_part,
                past_kv_len,
            )
            part, _, past_kv = forward(
                attn,
                hidden_part.clone(),
                attention_mask=lmask,
                position_ids=position_ids[:, start:end],
                past_key_value=past_kv,
                use_cache=True,
            )
            parts.append(part)
            past_kv_len = past_kv[0].shape[2]

        print(
            f"allclose(oneshot[:, 0], parts[0]) = {torch.allclose(oneshot[:, :part_len], parts[0])}"
        )
        print(
            f"allclose(oneshot, parts) = {torch.allclose(oneshot, torch.cat(parts, dim=1))}"
        )


if __name__ == "__main__":
    test()
robohusky/train/llama_rmsnorm_monkey_patch.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import transformers
2
+
3
def replace_llama_rmsnorm_with_fused_rmsnorm():
    """Swap `LlamaRMSNorm` for apex's `FusedRMSNorm` when apex is importable.

    Falls back silently to the stock implementation when apex is not installed,
    and prints a notice if apex exists but fails to load.
    """
    try:
        from apex.normalization import FusedRMSNorm
        from functools import partial

        # Bind eps to match LlamaRMSNorm's default before swapping the class in.
        fused_norm = partial(FusedRMSNorm, eps=1e-6)  # noqa
        transformers.models.llama.modeling_llama.LlamaRMSNorm = fused_norm
        print("Discovered apex.normalization.FusedRMSNorm - will use it instead of LlamaRMSNorm")
    except ImportError:
        # apex not installed: keep the normal LlamaRMSNorm
        pass
    except Exception:
        print("discovered apex but it failed to load, falling back to LlamaRMSNorm")
robohusky/train/train.py ADDED
@@ -0,0 +1,597 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright Qing-Long Zhang. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for sequence to sequence.
18
+ """
19
+ import json
20
+ import logging
21
+ import os
22
+ import sys
23
+ import warnings
24
+ from functools import partial
25
+
26
+ from multiprocessing import cpu_count
27
+
28
+ from typing import Optional
29
+ from dataclasses import dataclass, field
30
+
31
+ from torch.utils.data import Dataset, ConcatDataset
32
+ from datasets import load_dataset, load_from_disk
33
+
34
+ from robohusky.dist_utils import init_dist
35
+ from robohusky.model.modeling_husky_embody2 import HuskyForConditionalGeneration
36
+
37
+ import transformers
38
+ from transformers import (
39
+ HfArgumentParser,
40
+ TrainingArguments,
41
+ LlamaTokenizer,
42
+ Trainer,
43
+ set_seed,
44
+ default_data_collator,
45
+ DataCollatorForSeq2Seq,
46
+ )
47
+
48
+ from peft import (
49
+ LoraConfig,
50
+ get_peft_model,
51
+ prepare_model_for_int8_training,
52
+ )
53
+
54
+ from robohusky.base_dataset import (
55
+ process_func,
56
+ BaseDataset,
57
+ CephDataset,
58
+ build_transform
59
+ )
60
+
61
+ from transformers.trainer_utils import get_last_checkpoint
62
+ from transformers.utils import check_min_version
63
+ from transformers.utils.versions import require_version
64
+
65
+ from transformers.utils.logging import (
66
+ set_verbosity_info,
67
+ set_verbosity,
68
+ enable_default_handler,
69
+ enable_explicit_format,
70
+ )
71
+ from robohusky.train.llama_flash_attn_monkey_patch import (
72
+ replace_llama_attn_with_flash_attn
73
+ )
74
+
75
+ from robohusky.train.llama_rmsnorm_monkey_patch import (
76
+ replace_llama_rmsnorm_with_fused_rmsnorm
77
+ )
78
+
79
+ replace_llama_attn_with_flash_attn()
80
+ replace_llama_rmsnorm_with_fused_rmsnorm()
81
+
82
+ IGNORE_INDEX = -100
83
+ DEFAULT_UNK_TOKEN = "<unk>"
84
+ DEFAULT_IMG_START_TOKEN = "<img>"
85
+ DEFAULT_IMG_END_TOKEN = "</img>"
86
+
87
+ DEFAULT_VIDEO_START_TOKEN = "<vid>"
88
+ DEFAULT_VIDEO_END_TOKEN = "</vid>"
89
+
90
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
91
+ check_min_version("4.32.0.dev0")
92
+ require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
93
+
94
+ warnings.filterwarnings('ignore')
95
+ logger = logging.getLogger(__name__)
96
+
97
+ os.environ["WANDB_DISABLED"] = "true"
98
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
99
+
100
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=False,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    # FIX: the original help strings for the freeze/un_freeze flags were all
    # copy-pasted from an unrelated "head dimensions" option (and
    # `un_freeze_llm_head` described video patch embeddings); rewritten to
    # describe what each flag actually controls.
    freeze_model: bool = field(
        default=False,
        metadata={"help": "Freeze the entire model (no parameters are updated during training)."},
    )
    freeze_vision_model: bool = field(
        default=False,
        metadata={"help": "Freeze the vision encoder parameters during training."},
    )
    freeze_vision_adapter: bool = field(
        default=False,
        metadata={"help": "Freeze the vision adapter parameters during training."},
    )
    freeze_text_model: bool = field(
        default=False,
        metadata={"help": "Freeze the language model parameters during training."},
    )
    freeze_qformer: bool = field(
        default=False,
        metadata={"help": "Freeze the Q-Former parameters during training."},
    )
    un_freeze_vision_embedding: bool = field(
        default=False,
        metadata={"help": "Will enable to tuning image patch_embedding when vision_model are frozen"},
    )
    un_freeze_video_embedding: bool = field(
        default=False,
        metadata={"help": "Will enable to tuning video patch_embedding when vision_model are frozen"},
    )
    un_freeze_llm_head: bool = field(
        default=False,
        metadata={"help": "Keep the language-model head trainable even when the language model is frozen."},
    )
    use_lora: bool = field(
        default=False, metadata={"help": "add the LoRA adapters to the base model"}
    )
171
+
172
+ @dataclass
173
+ class DataTrainingArguments:
174
+ """
175
+ Arguments pertaining to what data we are going to input our model for training and eval.
176
+ """
177
+
178
+ dataset_name: Optional[str] = field(
179
+ default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
180
+ )
181
+ dataset_config_name: Optional[str] = field(
182
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
183
+ )
184
+ data_dir: Optional[str] = field(
185
+ default=None, metadata={"help": "The data directory containing input files."})
186
+ train_file: Optional[str] = field(
187
+ default=None, metadata={"help": "The input training data file (a jsonlines)."})
188
+ validation_file: Optional[str] = field(
189
+ default=None,
190
+ metadata={
191
+ "help": "An optional input evaluation data file to evaluate the metrics (sacrebleu) on a jsonlines file."
192
+ },
193
+ )
194
+ train_val_split: Optional[float] = field(
195
+ default=0.0, metadata={"help": "Percent to split off of train for validation."}
196
+ )
197
+ test_file: Optional[str] = field(
198
+ default=None,
199
+ metadata={"help": "An optional input test data file to evaluate the metrics (sacrebleu) on a jsonlines file."},
200
+ )
201
+ image_path: Optional[str] = field(
202
+ default=None,
203
+ metadata={"help": "An optional image path"},
204
+ )
205
+ video_path: Optional[str] = field(
206
+ default=None,
207
+ metadata={"help": "An optional video path"},
208
+ )
209
+ input_size: Optional[int] = field(
210
+ default=224,
211
+ metadata={"help": "The input size of images."},
212
+ )
213
+ overwrite_cache: bool = field(
214
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
215
+ )
216
+ preprocessing_num_workers: Optional[int] = field(
217
+ default=None,
218
+ metadata={"help": "The number of processes to use for the preprocessing."},
219
+ )
220
+ max_seq_length: Optional[int] = field(
221
+ default=128,
222
+ metadata={
223
+ "help": (
224
+ "The maximum total input sequence length after tokenization. Sequences longer "
225
+ "than this will be truncated, sequences shorter will be padded."
226
+ )
227
+ },
228
+ )
229
+ pad_to_max_length: bool = field(
230
+ default=False,
231
+ metadata={
232
+ "help": (
233
+ "Whether to pad all samples to model maximum sentence length. "
234
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
235
+ "efficient on GPU but very bad for TPU."
236
+ )
237
+ },
238
+ )
239
+ val_max_length: Optional[int] = field(
240
+ default=None,
241
+ metadata={
242
+ "help": (
243
+ "The maximum total sequence length for validation target text after tokenization. Sequences longer "
244
+ "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
245
+ "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
246
+ "during ``evaluate`` and ``predict``."
247
+ )
248
+ },
249
+ )
250
+ max_train_samples: Optional[int] = field(
251
+ default=None,
252
+ metadata={
253
+ "help": (
254
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
255
+ "value if set."
256
+ )
257
+ },
258
+ )
259
+ max_eval_samples: Optional[int] = field(
260
+ default=None,
261
+ metadata={
262
+ "help": (
263
+ "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
264
+ "value if set."
265
+ )
266
+ },
267
+ )
268
+ max_predict_samples: Optional[int] = field(
269
+ default=None,
270
+ metadata={
271
+ "help": (
272
+ "For debugging purposes or quicker training, truncate the number of prediction examples to this "
273
+ "value if set."
274
+ )
275
+ },
276
+ )
277
+ conv_style: Optional[str] = field(
278
+ default=None, metadata={"help": "prompt style for a conversation."}
279
+ )
280
+ save_data_path: Optional[str] = field(
281
+ default=None, metadata={"help": "prompt style for a conversation."}
282
+ )
283
+ num_beams: Optional[int] = field(
284
+ default=None,
285
+ metadata={
286
+ "help": (
287
+ "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
288
+ "which is used during ``evaluate`` and ``predict``."
289
+ )
290
+ },
291
+ )
292
+ ignore_pad_token_for_loss: bool = field(
293
+ default=True,
294
+ metadata={
295
+ "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
296
+ },
297
+ )
298
+ source_prefix: Optional[str] = field(
299
+ default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
300
+ )
301
+ forced_bos_token: Optional[str] = field(
302
+ default=None,
303
+ metadata={
304
+ "help": (
305
+ "The token to force as the first generated token after the :obj:`decoder_start_token_id`.Useful for"
306
+ " multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token needs to"
307
+ " be the target language token.(Usually it is the target language token)"
308
+ )
309
+ },
310
+ )
311
+
312
+ def __post_init__(self):
313
+ if self.dataset_name is None and self.train_file is None and self.validation_file is None:
314
+ raise ValueError("Need either a dataset name or a training/validation file.")
315
+ # accepting both json and jsonl file extensions, as
316
+ # many jsonlines files actually have a .json extension
317
+ else:
318
+ if self.train_file is not None:
319
+ extension = self.train_file.split(".")[-1]
320
+ assert extension in ["csv", "json", "jsonl", "parquet"], "`train_file` should be a csv or a json file."
321
+ if self.validation_file is not None:
322
+ extension = self.validation_file.split(".")[-1]
323
+ assert extension in ["csv", "json", "jsonl",
324
+ "parquet"], "`validation_file` should be a csv or a json file."
325
+ if self.test_file is not None:
326
+ extension = self.test_file.split(".")[-1]
327
+ assert extension == "json", "`test_file` should be a json file."
328
+
329
+ def main():
330
+ # 1. Parse input arguments
331
+ # See all possible arguments in src/transformers/training_args.py
332
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
333
+ init_dist(launcher='slurm', backend='nccl', port=29598)
334
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
335
+ # If we pass only one argument to the script, and it's the path to a json file,
336
+ # let's parse it to get our arguments.
337
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
338
+ else:
339
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
340
+
341
+ # 2. Setup logging
342
+ logging.basicConfig(
343
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
344
+ datefmt="%m/%d/%Y %H:%M:%S",
345
+ handlers=[logging.StreamHandler(sys.stdout)],
346
+ )
347
+
348
+ if training_args.should_log:
349
+ # The default of training_args.log_level is passive, so we set log level at info here to have that default.
350
+ transformers.utils.logging.set_verbosity_info()
351
+
352
+ log_level = training_args.get_process_log_level()
353
+ logger.setLevel(log_level)
354
+ set_verbosity(log_level)
355
+ enable_default_handler()
356
+ enable_explicit_format()
357
+
358
+ # Log on each process the small summary:
359
+ logger.warning(
360
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
361
+ + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
362
+ )
363
+ logger.info(f"Training/evaluation parameters {training_args}")
364
+
365
+ # 3. Detecting last checkpoint and eventually continue from last checkpoint.
366
+ last_checkpoint = None
367
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
368
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
369
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
370
+ raise ValueError(
371
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
372
+ "Use --overwrite_output_dir to overcome."
373
+ )
374
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
375
+ logger.info(
376
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
377
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
378
+ )
379
+
380
+ # Set seed before initializing model.
381
+ set_seed(training_args.seed)
382
+
383
+ # 4. Get the datasets
384
+ # you can either provide your own JSON training and evaluation files (see below)
385
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
386
+ # (the dataset will be downloaded automatically from the datasets Hub).
387
+ #
388
+ # In distributed training, the load_dataset function guarantee that only one local process can concurrently
389
+ # download the dataset.
390
+
391
+ if data_args.dataset_name is not None:
392
+ # Downloading and loading a dataset from the hub.
393
+ ds = load_dataset(
394
+ data_args.dataset_name,
395
+ data_args.dataset_config_name,
396
+ data_dir=data_args.data_dir,
397
+ cache_dir=model_args.cache_dir,
398
+ use_auth_token=True if model_args.use_auth_token else None,
399
+ )
400
+ else:
401
+ data_files = {}
402
+ if data_args.train_file is not None:
403
+ data_files["train"] = data_args.train_file
404
+ extension = data_args.train_file.split(".")[-1]
405
+ if data_args.validation_file is not None:
406
+ data_files["validation"] = data_args.validation_file
407
+ extension = data_args.validation_file.split(".")[-1]
408
+ if data_args.test_file is not None:
409
+ data_files["test"] = data_args.test_file
410
+ extension = data_args.test_file.split(".")[-1]
411
+
412
+ # ds = load_dataset(
413
+ # "json" if extension == "jsonl" else extension,
414
+ # data_files=data_files,
415
+ # split="train"
416
+ # )
417
+ ds = json.load(open(data_args.train_file, "r"))
418
+
419
+ # 5. Load pretrained model, tokenizer, and image processor
420
+ #
421
+ # Distributed training: The .from_pretrained methods guarantee that only one local process can concurrently
422
+ # download model & vocab.
423
+ tokenizer = LlamaTokenizer.from_pretrained(
424
+ model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
425
+ cache_dir=model_args.cache_dir,
426
+ use_fast=model_args.use_fast_tokenizer,
427
+ legacy=True,
428
+ )
429
+ # add special token
430
+ tokenizer.pad_token_id = 0
431
+ if tokenizer.unk_token is None:
432
+ tokenizer.add_special_tokens({"unk_token": DEFAULT_UNK_TOKEN})
433
+
434
+ tokens_list = [
435
+ DEFAULT_IMG_START_TOKEN, DEFAULT_IMG_END_TOKEN,
436
+ DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN
437
+ ]
438
+ tokenizer.add_tokens(tokens_list, special_tokens=True)
439
+
440
+ model = HuskyForConditionalGeneration.from_pretrained(
441
+ model_args.model_name_or_path, ignore_mismatched_sizes=True
442
+ )
443
+ embedding_size = model.language_model.get_input_embeddings().weight.shape[0]
444
+
445
+ # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
446
+ # on a small vocab and want a smaller embedding size, remove this test.
447
+
448
+ if len(tokenizer) > embedding_size:
449
+ model.resize_token_embeddings(len(tokenizer))
450
+ model.language_model.resize_token_embeddings(len(tokenizer))
451
+ model.config.text_config.vocab_size = len(tokenizer)
452
+
453
+ model.config.use_cache = False
454
+
455
+ def _freeze_params(module):
456
+ for param in module.parameters():
457
+ param.requires_grad = False
458
+
459
+ if model_args.freeze_model:
460
+ _freeze_params(model)
461
+ # only update language projection
462
+ model.language_projection.weight.requires_grad = True
463
+
464
+ if model_args.freeze_vision_model:
465
+ model.vision_model = model.vision_model.eval()
466
+ _freeze_params(model.vision_model)
467
+
468
+ if model_args.freeze_vision_adapter:
469
+ _freeze_params(model.vision_adapter)
470
+
471
+ if model_args.freeze_qformer:
472
+ model.qformer = model.qformer.eval()
473
+ _freeze_params(model.qformer)
474
+ model.query_tokens.requires_grad = False
475
+
476
+ if model_args.freeze_text_model:
477
+ _freeze_params(model.language_model)
478
+
479
+ if model_args.use_lora:
480
+ training_args.ddp_find_unused_parameters = False
481
+ _freeze_params(model)
482
+ lora_config = LoraConfig(
483
+ r=16,
484
+ target_modules=["q_proj", "v_proj"],
485
+ lora_alpha=32,
486
+ lora_dropout=0.05,
487
+ bias="none",
488
+ task_type="CAUSAL_LM",
489
+ )
490
+ model.language_model = get_peft_model(model.language_model, lora_config)
491
+ model.language_model.print_trainable_parameters()
492
+
493
+ if model_args.un_freeze_video_embedding:
494
+ _freeze_params(model)
495
+ model.vision_model.video_embeddings.patch_embedding.weight.requires_grad = True
496
+ model.vision_model.video_embeddings.class_embedding.requires_grad = True
497
+ model.vision_model.video_embeddings.position_embedding.requires_grad = True
498
+
499
+ if model_args.un_freeze_llm_head:
500
+ model.language_model.lm_head.weight.requires_grad = True
501
+
502
+ # set seed for torch dataloaders
503
+ set_seed(training_args.seed)
504
+
505
+ # 7. Preprocessing the datasets.
506
+ # We need to tokenize input captions and transform the images.
507
+
508
+ # set padding.
509
+ padding = "max_length" if data_args.pad_to_max_length else False
510
+
511
+ def husky_processor(examples):
512
+ processor = partial(
513
+ process_func,
514
+ tokenizer=tokenizer,
515
+ max_seq_length=data_args.max_seq_length,
516
+ )
517
+ model_inputs = processor(examples)
518
+ return model_inputs
519
+
520
+ # Data collator
521
+ label_pad_token_id = IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
522
+ if data_args.pad_to_max_length:
523
+ data_collator = default_data_collator
524
+ else:
525
+ data_collator = DataCollatorForSeq2Seq(
526
+ tokenizer,
527
+ model=model,
528
+ label_pad_token_id=label_pad_token_id,
529
+ pad_to_multiple_of=8 if training_args.fp16 else None,
530
+ )
531
+
532
+ concat_dataset = []
533
+ for data in ds:
534
+ data_file = data["text_file"]
535
+ extension = data_file.split(".")[-1]
536
+ extension = "json" if extension == "jsonl" else extension
537
+ logger.info(f"Loading dataset: {data['data_name']}")
538
+
539
+ raw_dataset = load_dataset(extension, data_files=data_file, num_proc=cpu_count(), split="train")
540
+ if data["data_type"] == "base":
541
+ temp = BaseDataset(
542
+ raw_dataset,
543
+ processor=husky_processor,
544
+ image_path=data["image_path"],
545
+ input_size=data_args.input_size
546
+ )
547
+ else:
548
+ temp = CephDataset(
549
+ raw_dataset,
550
+ processor=husky_processor,
551
+ input_size=data_args.input_size
552
+ )
553
+ concat_dataset.append(temp)
554
+
555
+ logger.info(f"All datasets have been loaded!")
556
+
557
+ if len(concat_dataset) > 1:
558
+ train_dataset = ConcatDataset(concat_dataset)
559
+ # train_dataset = train_dataset.shuffle(seed=42)
560
+ else:
561
+ train_dataset = concat_dataset[0]
562
+
563
+ # 8. Initialize our Trainer
564
+ trainer = Trainer(
565
+ model=model,
566
+ args=training_args,
567
+ train_dataset=train_dataset if training_args.do_train else None,
568
+ eval_dataset=None,
569
+ tokenizer=tokenizer,
570
+ data_collator=data_collator,
571
+ )
572
+
573
+ # 9. Training
574
+ if training_args.do_train:
575
+ checkpoint = None
576
+ if training_args.resume_from_checkpoint is not None:
577
+ checkpoint = training_args.resume_from_checkpoint
578
+ elif last_checkpoint is not None:
579
+ checkpoint = last_checkpoint
580
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
581
+ if model_args.use_lora:
582
+ model.language_model.save_pretrained(training_args.output_dir)
583
+ else:
584
+ trainer.save_model() # Saves the tokenizer too for easy upload
585
+
586
+ metrics = train_result.metrics
587
+ max_train_samples = (
588
+ data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
589
+ )
590
+ metrics["train_samples"] = min(max_train_samples, len(train_dataset))
591
+
592
+ trainer.log_metrics("train", metrics)
593
+ trainer.save_metrics("train", metrics)
594
+ trainer.save_state()
595
+
596
+ if __name__ == "__main__":
597
+ main()
robohusky/train/train_uni.py ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright Qing-Long Zhang. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for sequence to sequence.
18
+ """
19
+ import json
20
+ import logging
21
+ import os
22
+ import sys
23
+ import warnings
24
+ from functools import partial
25
+
26
+ from multiprocessing import cpu_count
27
+
28
+ from typing import Optional
29
+ from dataclasses import dataclass, field
30
+
31
+ from torch.utils.data import Dataset, ConcatDataset
32
+ from datasets import load_dataset, load_from_disk
33
+
34
+ from robohusky.dist_utils import init_dist
35
+ from robohusky.model.modeling_husky_embody2 import HuskyForConditionalGeneration
36
+
37
+ import transformers
38
+ from transformers import (
39
+ HfArgumentParser,
40
+ TrainingArguments,
41
+ LlamaTokenizer,
42
+ Trainer,
43
+ set_seed,
44
+ default_data_collator,
45
+ DataCollatorForSeq2Seq,
46
+ )
47
+
48
+ from peft import (
49
+ LoraConfig,
50
+ get_peft_model,
51
+ prepare_model_for_int8_training,
52
+ )
53
+
54
+ from robohusky.base_dataset_uni import (
55
+ process_func,
56
+ BaseDataset,
57
+ WeightedConcatDataset
58
+ )
59
+
60
+ from transformers.trainer_utils import get_last_checkpoint
61
+ from transformers.utils import check_min_version
62
+ from transformers.utils.versions import require_version
63
+
64
+ from transformers.utils.logging import (
65
+ set_verbosity_info,
66
+ set_verbosity,
67
+ enable_default_handler,
68
+ enable_explicit_format,
69
+ )
70
+ from robohusky.train.llama_flash_attn_monkey_patch import (
71
+ replace_llama_attn_with_flash_attn
72
+ )
73
+
74
+ from robohusky.train.llama_rmsnorm_monkey_patch import (
75
+ replace_llama_rmsnorm_with_fused_rmsnorm
76
+ )
77
+
78
+ replace_llama_attn_with_flash_attn()
79
+ replace_llama_rmsnorm_with_fused_rmsnorm()
80
+
81
+ IGNORE_INDEX = -100
82
+ DEFAULT_UNK_TOKEN = "<unk>"
83
+ DEFAULT_IMG_START_TOKEN = "<img>"
84
+ DEFAULT_IMG_END_TOKEN = "</img>"
85
+
86
+ DEFAULT_VIDEO_START_TOKEN = "<vid>"
87
+ DEFAULT_VIDEO_END_TOKEN = "</vid>"
88
+
89
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
90
+ check_min_version("4.32.0.dev0")
91
+ require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
92
+
93
+ warnings.filterwarnings('ignore')
94
+ logger = logging.getLogger(__name__)
95
+
96
+ os.environ["WANDB_DISABLED"] = "true"
97
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
98
+
99
+ @dataclass
100
+ class ModelArguments:
101
+ """
102
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
103
+ """
104
+
105
+ model_name_or_path: str = field(
106
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
107
+ )
108
+ config_name: Optional[str] = field(
109
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
110
+ )
111
+ tokenizer_name: Optional[str] = field(
112
+ default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
113
+ )
114
+ cache_dir: Optional[str] = field(
115
+ default=None,
116
+ metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
117
+ )
118
+ use_fast_tokenizer: bool = field(
119
+ default=False,
120
+ metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
121
+ )
122
+ model_revision: str = field(
123
+ default="main",
124
+ metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
125
+ )
126
+ use_auth_token: bool = field(
127
+ default=False,
128
+ metadata={
129
+ "help": (
130
+ "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
131
+ "with private models)."
132
+ )
133
+ },
134
+ )
135
+ freeze_model: bool = field(
136
+ default=False,
137
+ metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
138
+ )
139
+ freeze_vision_model: bool = field(
140
+ default=False,
141
+ metadata={"help": "Will enable to load a pretrained vision model whose head dimensions are different."},
142
+ )
143
+ freeze_vision_adapter: bool = field(
144
+ default=False,
145
+ metadata={"help": "Will enable to load a pretrained vision adapter whose head dimensions are different."},
146
+ )
147
+ freeze_text_model: bool = field(
148
+ default=False,
149
+ metadata={"help": "Will enable to load a pretrained text model whose head dimensions are different."},
150
+ )
151
+ freeze_qformer: bool = field(
152
+ default=False,
153
+ metadata={"help": "Will enable to load a pretrained qformer model whose head dimensions are different."},
154
+ )
155
+ un_freeze_vision_embedding: bool = field(
156
+ default=False,
157
+ metadata={"help": "Will enable to tuning image patch_embedding when vision_model are frozen"},
158
+ )
159
+ un_freeze_video_embedding: bool = field(
160
+ default=False,
161
+ metadata={"help": "Will enable to tuning video patch_embedding when vision_model are frozen"},
162
+ )
163
+ un_freeze_llm_head: bool = field(
164
+ default=False,
165
+ metadata={"help": "Will enable to tuning video patch_embedding when vision_model are frozen"},
166
+ )
167
+ use_lora: bool = field(
168
+ default=False, metadata={"help": "add the LoRA adapters to the base model"}
169
+ )
170
+
171
+ @dataclass
172
+ class DataTrainingArguments:
173
+ """
174
+ Arguments pertaining to what data we are going to input our model for training and eval.
175
+ """
176
+
177
+ dataset_name: Optional[str] = field(
178
+ default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
179
+ )
180
+ dataset_config_name: Optional[str] = field(
181
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
182
+ )
183
+ data_dir: Optional[str] = field(
184
+ default=None, metadata={"help": "The data directory containing input files."})
185
+ train_file: Optional[str] = field(
186
+ default=None, metadata={"help": "The input training data file (a jsonlines)."})
187
+ validation_file: Optional[str] = field(
188
+ default=None,
189
+ metadata={
190
+ "help": "An optional input evaluation data file to evaluate the metrics (sacrebleu) on a jsonlines file."
191
+ },
192
+ )
193
+ train_val_split: Optional[float] = field(
194
+ default=0.0, metadata={"help": "Percent to split off of train for validation."}
195
+ )
196
+ test_file: Optional[str] = field(
197
+ default=None,
198
+ metadata={"help": "An optional input test data file to evaluate the metrics (sacrebleu) on a jsonlines file."},
199
+ )
200
+ image_path: Optional[str] = field(
201
+ default=None,
202
+ metadata={"help": "An optional image path"},
203
+ )
204
+ video_path: Optional[str] = field(
205
+ default=None,
206
+ metadata={"help": "An optional video path"},
207
+ )
208
+ input_size: Optional[int] = field(
209
+ default=224,
210
+ metadata={"help": "The input size of images."},
211
+ )
212
+ overwrite_cache: bool = field(
213
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
214
+ )
215
+ preprocessing_num_workers: Optional[int] = field(
216
+ default=None,
217
+ metadata={"help": "The number of processes to use for the preprocessing."},
218
+ )
219
+ max_seq_length: Optional[int] = field(
220
+ default=128,
221
+ metadata={
222
+ "help": (
223
+ "The maximum total input sequence length after tokenization. Sequences longer "
224
+ "than this will be truncated, sequences shorter will be padded."
225
+ )
226
+ },
227
+ )
228
+ pad_to_max_length: bool = field(
229
+ default=False,
230
+ metadata={
231
+ "help": (
232
+ "Whether to pad all samples to model maximum sentence length. "
233
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
234
+ "efficient on GPU but very bad for TPU."
235
+ )
236
+ },
237
+ )
238
+ val_max_length: Optional[int] = field(
239
+ default=None,
240
+ metadata={
241
+ "help": (
242
+ "The maximum total sequence length for validation target text after tokenization. Sequences longer "
243
+ "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
244
+ "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
245
+ "during ``evaluate`` and ``predict``."
246
+ )
247
+ },
248
+ )
249
+ max_train_samples: Optional[int] = field(
250
+ default=None,
251
+ metadata={
252
+ "help": (
253
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
254
+ "value if set."
255
+ )
256
+ },
257
+ )
258
+ max_eval_samples: Optional[int] = field(
259
+ default=None,
260
+ metadata={
261
+ "help": (
262
+ "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
263
+ "value if set."
264
+ )
265
+ },
266
+ )
267
+ max_predict_samples: Optional[int] = field(
268
+ default=None,
269
+ metadata={
270
+ "help": (
271
+ "For debugging purposes or quicker training, truncate the number of prediction examples to this "
272
+ "value if set."
273
+ )
274
+ },
275
+ )
276
+ conv_style: Optional[str] = field(
277
+ default=None, metadata={"help": "prompt style for a conversation."}
278
+ )
279
+ save_data_path: Optional[str] = field(
280
+ default=None, metadata={"help": "prompt style for a conversation."}
281
+ )
282
+ num_beams: Optional[int] = field(
283
+ default=None,
284
+ metadata={
285
+ "help": (
286
+ "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
287
+ "which is used during ``evaluate`` and ``predict``."
288
+ )
289
+ },
290
+ )
291
+ ignore_pad_token_for_loss: bool = field(
292
+ default=True,
293
+ metadata={
294
+ "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
295
+ },
296
+ )
297
+ source_prefix: Optional[str] = field(
298
+ default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
299
+ )
300
+ forced_bos_token: Optional[str] = field(
301
+ default=None,
302
+ metadata={
303
+ "help": (
304
+ "The token to force as the first generated token after the :obj:`decoder_start_token_id`.Useful for"
305
+ " multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token needs to"
306
+ " be the target language token.(Usually it is the target language token)"
307
+ )
308
+ },
309
+ )
310
+
311
+ def __post_init__(self):
312
+ if self.dataset_name is None and self.train_file is None and self.validation_file is None:
313
+ raise ValueError("Need either a dataset name or a training/validation file.")
314
+ # accepting both json and jsonl file extensions, as
315
+ # many jsonlines files actually have a .json extension
316
+ else:
317
+ if self.train_file is not None:
318
+ extension = self.train_file.split(".")[-1]
319
+ assert extension in ["csv", "json", "jsonl", "parquet"], "`train_file` should be a csv or a json file."
320
+ if self.validation_file is not None:
321
+ extension = self.validation_file.split(".")[-1]
322
+ assert extension in ["csv", "json", "jsonl",
323
+ "parquet"], "`validation_file` should be a csv or a json file."
324
+ if self.test_file is not None:
325
+ extension = self.test_file.split(".")[-1]
326
+ assert extension == "json", "`test_file` should be a json file."
327
+
328
+ def main():
329
+ # 1. Parse input arguments
330
+ # See all possible arguments in src/transformers/training_args.py
331
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
332
+ init_dist(launcher='slurm', backend='nccl', port=29598)
333
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
334
+ # If we pass only one argument to the script, and it's the path to a json file,
335
+ # let's parse it to get our arguments.
336
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
337
+ else:
338
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
339
+
340
+ # 2. Setup logging
341
+ logging.basicConfig(
342
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
343
+ datefmt="%m/%d/%Y %H:%M:%S",
344
+ handlers=[logging.StreamHandler(sys.stdout)],
345
+ )
346
+
347
+ if training_args.should_log:
348
+ # The default of training_args.log_level is passive, so we set log level at info here to have that default.
349
+ transformers.utils.logging.set_verbosity_info()
350
+
351
+ log_level = training_args.get_process_log_level()
352
+ logger.setLevel(log_level)
353
+ set_verbosity(log_level)
354
+ enable_default_handler()
355
+ enable_explicit_format()
356
+
357
+ # Log on each process the small summary:
358
+ logger.warning(
359
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
360
+ + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
361
+ )
362
+ logger.info(f"Training/evaluation parameters {training_args}")
363
+
364
+ # 3. Detecting last checkpoint and eventually continue from last checkpoint.
365
+ last_checkpoint = None
366
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
367
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
368
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
369
+ raise ValueError(
370
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
371
+ "Use --overwrite_output_dir to overcome."
372
+ )
373
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
374
+ logger.info(
375
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
376
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
377
+ )
378
+
379
+ # Set seed before initializing model.
380
+ set_seed(training_args.seed)
381
+
382
+ # 4. Get the datasets
383
+ # you can either provide your own JSON training and evaluation files (see below)
384
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
385
+ # (the dataset will be downloaded automatically from the datasets Hub).
386
+ #
387
+ # In distributed training, the load_dataset function guarantee that only one local process can concurrently
388
+ # download the dataset.
389
+
390
+ if data_args.dataset_name is not None:
391
+ # Downloading and loading a dataset from the hub.
392
+ ds = load_dataset(
393
+ data_args.dataset_name,
394
+ data_args.dataset_config_name,
395
+ data_dir=data_args.data_dir,
396
+ cache_dir=model_args.cache_dir,
397
+ use_auth_token=True if model_args.use_auth_token else None,
398
+ )
399
+ else:
400
+ data_files = {}
401
+ if data_args.train_file is not None:
402
+ data_files["train"] = data_args.train_file
403
+ extension = data_args.train_file.split(".")[-1]
404
+ if data_args.validation_file is not None:
405
+ data_files["validation"] = data_args.validation_file
406
+ extension = data_args.validation_file.split(".")[-1]
407
+ if data_args.test_file is not None:
408
+ data_files["test"] = data_args.test_file
409
+ extension = data_args.test_file.split(".")[-1]
410
+
411
+ # ds = load_dataset(
412
+ # "json" if extension == "jsonl" else extension,
413
+ # data_files=data_files,
414
+ # split="train"
415
+ # )
416
+ ds = json.load(open(data_args.train_file, "r"))
417
+
418
+ # 5. Load pretrained model, tokenizer, and image processor
419
+ #
420
+ # Distributed training: The .from_pretrained methods guarantee that only one local process can concurrently
421
+ # download model & vocab.
422
+ tokenizer = LlamaTokenizer.from_pretrained(
423
+ model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
424
+ cache_dir=model_args.cache_dir,
425
+ use_fast=model_args.use_fast_tokenizer,
426
+ legacy=True,
427
+ )
428
+ # add special token
429
+ tokenizer.pad_token_id = 0
430
+ if tokenizer.unk_token is None:
431
+ tokenizer.add_special_tokens({"unk_token": DEFAULT_UNK_TOKEN})
432
+
433
+ tokens_list = [
434
+ DEFAULT_IMG_START_TOKEN, DEFAULT_IMG_END_TOKEN,
435
+ DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN
436
+ ]
437
+ tokenizer.add_tokens(tokens_list, special_tokens=True)
438
+
439
+ model = HuskyForConditionalGeneration.from_pretrained(
440
+ model_args.model_name_or_path, ignore_mismatched_sizes=True
441
+ )
442
+ embedding_size = model.language_model.get_input_embeddings().weight.shape[0]
443
+
444
+ # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
445
+ # on a small vocab and want a smaller embedding size, remove this test.
446
+
447
+ if len(tokenizer) > embedding_size:
448
+ model.resize_token_embeddings(len(tokenizer))
449
+ model.language_model.resize_token_embeddings(len(tokenizer))
450
+ model.config.text_config.vocab_size = len(tokenizer)
451
+
452
+ model.config.use_cache = False
453
+
454
+ def _freeze_params(module):
455
+ for param in module.parameters():
456
+ param.requires_grad = False
457
+
458
+ if model_args.freeze_model:
459
+ _freeze_params(model)
460
+ # only update language projection
461
+ model.language_projection.weight.requires_grad = True
462
+
463
+ if model_args.freeze_vision_model:
464
+ model.vision_model = model.vision_model.eval()
465
+ _freeze_params(model.vision_model)
466
+
467
+ if model_args.freeze_vision_adapter:
468
+ _freeze_params(model.vision_adapter)
469
+
470
+ if model_args.freeze_qformer:
471
+ model.qformer = model.qformer.eval()
472
+ _freeze_params(model.qformer)
473
+ model.query_tokens.requires_grad = False
474
+
475
+ if model_args.freeze_text_model:
476
+ _freeze_params(model.language_model)
477
+
478
+ if model_args.use_lora:
479
+ training_args.ddp_find_unused_parameters = False
480
+ _freeze_params(model)
481
+ lora_config = LoraConfig(
482
+ r=16,
483
+ target_modules=["q_proj", "v_proj"],
484
+ lora_alpha=32,
485
+ lora_dropout=0.05,
486
+ bias="none",
487
+ task_type="CAUSAL_LM",
488
+ )
489
+ model.language_model = get_peft_model(model.language_model, lora_config)
490
+ model.language_model.print_trainable_parameters()
491
+
492
+ if model_args.un_freeze_video_embedding:
493
+ _freeze_params(model)
494
+ model.vision_model.video_embeddings.patch_embedding.weight.requires_grad = True
495
+ model.vision_model.video_embeddings.class_embedding.requires_grad = True
496
+ model.vision_model.video_embeddings.position_embedding.requires_grad = True
497
+
498
+ if model_args.un_freeze_llm_head:
499
+ model.language_model.lm_head.weight.requires_grad = True
500
+
501
+ # set seed for torch dataloaders
502
+ set_seed(training_args.seed)
503
+
504
+ # 7. Preprocessing the datasets.
505
+ # We need to tokenize input captions and transform the images.
506
+
507
+ # set padding.
508
+ padding = "max_length" if data_args.pad_to_max_length else False
509
+
510
+ def husky_processor(examples):
511
+ processor = partial(
512
+ process_func,
513
+ tokenizer=tokenizer,
514
+ max_seq_length=data_args.max_seq_length,
515
+ )
516
+ model_inputs = processor(examples)
517
+ return model_inputs
518
+
519
+ # Data collator
520
+ label_pad_token_id = IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
521
+ if data_args.pad_to_max_length:
522
+ data_collator = default_data_collator
523
+ else:
524
+ data_collator = DataCollatorForSeq2Seq(
525
+ tokenizer,
526
+ model=model,
527
+ label_pad_token_id=label_pad_token_id,
528
+ pad_to_multiple_of=8 if training_args.fp16 else None,
529
+ )
530
+
531
+ concat_dataset = []
532
+ weights = []
533
+ batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps * training_args.world_size
534
+ for data in ds:
535
+ data_name = data['data_name']
536
+ data_file = data["text_file"]
537
+ extension = data_file.split(".")[-1]
538
+ extension = "json" if extension == "jsonl" else extension
539
+ logger.info(f"Loading dataset: {data_name}")
540
+
541
+ raw_dataset = load_dataset(extension, data_files=data_file, num_proc=cpu_count(), split="train")
542
+ raw_dataset = raw_dataset.shuffle(seed=0)
543
+ max_train_sample = min(len(raw_dataset), batch_size * (len(raw_dataset) // batch_size))
544
+ raw_dataset = raw_dataset.select(range(max_train_sample))
545
+
546
+ media_type = data["data_type"]
547
+ input_size = data_args.video_size if media_type == "video" else data_args.input_size
548
+
549
+ temp = BaseDataset(
550
+ raw_dataset,
551
+ processor=husky_processor,
552
+ image_path=data["image_path"],
553
+ input_size=input_size,
554
+ num_segments=8,
555
+ norm_type="openai",
556
+ media_type=media_type
557
+ )
558
+
559
+ concat_dataset.append(temp)
560
+ weights.append(1 / len(temp))
561
+ logger.info(f"All datasets have been loaded!")
562
+
563
+ if len(concat_dataset) > 1:
564
+ train_dataset = WeightedConcatDataset(datasets=concat_dataset, weights=weights, batch_size=batch_size)
565
+ else:
566
+ train_dataset = concat_dataset[0]
567
+
568
+
569
+ # 8. Initialize our Trainer
570
+ trainer = Trainer(
571
+ model=model,
572
+ args=training_args,
573
+ train_dataset=train_dataset if training_args.do_train else None,
574
+ eval_dataset=None,
575
+ tokenizer=tokenizer,
576
+ data_collator=data_collator,
577
+ )
578
+
579
+ # 9. Training
580
+ if training_args.do_train:
581
+ checkpoint = None
582
+ if training_args.resume_from_checkpoint is not None:
583
+ checkpoint = training_args.resume_from_checkpoint
584
+ elif last_checkpoint is not None:
585
+ checkpoint = last_checkpoint
586
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
587
+ if model_args.use_lora:
588
+ model.language_model.save_pretrained(training_args.output_dir)
589
+ else:
590
+ trainer.save_model() # Saves the tokenizer too for easy upload
591
+
592
+ metrics = train_result.metrics
593
+ max_train_samples = (
594
+ data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
595
+ )
596
+ metrics["train_samples"] = min(max_train_samples, len(train_dataset))
597
+
598
+ trainer.log_metrics("train", metrics)
599
+ trainer.save_metrics("train", metrics)
600
+ trainer.save_state()
601
+
602
+ if __name__ == "__main__":
603
+ main()
robohusky/utils.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from asyncio import AbstractEventLoop
2
+ import json
3
+ import logging
4
+ import logging.handlers
5
+ import os
6
+ import platform
7
+ import sys
8
+ from typing import AsyncGenerator, Generator
9
+ import warnings
10
+
11
+ import requests
12
+ import torch
13
+
14
+ from husky.constants import LOGDIR
15
+
16
+ handler = None
17
+
18
+
19
def build_logger(logger_name, logger_filename):
    """Create (or fetch) a named logger and wire up process-wide logging.

    Side effects (process-wide):
      * installs a uniform format on the root console handler
      * replaces ``sys.stdout`` / ``sys.stderr`` with StreamToLogger wrappers
      * on first call, attaches one shared daily-rotating UTF-8 file handler
        (under LOGDIR) to every logger known at that point

    :param logger_name: name passed to ``logging.getLogger``
    :param logger_filename: log file name created inside LOGDIR
    :returns: the configured ``logging.Logger``
    """
    global handler

    formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Set the format of root handlers
    if not logging.getLogger().handlers:
        if sys.version_info[1] >= 9:
            # Python >= 3.9: basicConfig accepts `encoding` (needed for
            # UTF-8 console output on Windows).
            logging.basicConfig(level=logging.INFO, encoding="utf-8")
        else:
            if platform.system() == "Windows":
                warnings.warn(
                    "If you are running on Windows, "
                    "we recommend you use Python >= 3.9 for UTF-8 encoding."
                )
            logging.basicConfig(level=logging.INFO)
    logging.getLogger().handlers[0].setFormatter(formatter)

    # Redirect stdout and stderr to loggers so that print()/tracebacks
    # end up in the log file as well.
    stdout_logger = logging.getLogger("stdout")
    stdout_logger.setLevel(logging.INFO)
    sl = StreamToLogger(stdout_logger, logging.INFO)
    sys.stdout = sl

    stderr_logger = logging.getLogger("stderr")
    stderr_logger.setLevel(logging.ERROR)
    sl = StreamToLogger(stderr_logger, logging.ERROR)
    sys.stderr = sl

    # Get logger
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # Add a file handler for all loggers (module-level `handler` guards
    # against attaching it more than once per process).
    if handler is None:
        os.makedirs(LOGDIR, exist_ok=True)
        filename = os.path.join(LOGDIR, logger_filename)
        handler = logging.handlers.TimedRotatingFileHandler(
            filename, when="D", utc=True, encoding="utf-8"
        )
        handler.setFormatter(formatter)

        # NOTE: loggers created *after* this call will not get the file
        # handler automatically.
        for name, item in logging.root.manager.loggerDict.items():
            if isinstance(item, logging.Logger):
                item.addHandler(handler)

    return logger
70
+
71
+
72
class StreamToLogger(object):
    """
    Fake file-like stream object that redirects writes to a logger instance.
    """

    def __init__(self, logger, log_level=logging.INFO):
        self.terminal = sys.stdout
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ""

    def __getattr__(self, attr):
        # Delegate any file-like attribute we don't implement to the
        # real stdout captured at construction time.
        return getattr(self.terminal, attr)

    def write(self, buf):
        pending = self.linebuf + buf
        self.linebuf = ""
        for chunk in pending.splitlines(True):
            # splitlines(True) keeps the terminators, so a chunk ending in
            # "\n" is a complete line and gets logged; an unterminated tail
            # is buffered until the next write delivers its newline.
            if chunk.endswith("\n"):
                message = chunk.encode("utf-8", "ignore").decode("utf-8")
                self.logger.log(self.log_level, message.rstrip())
            else:
                self.linebuf += chunk

    def flush(self):
        # Emit whatever partial line is still buffered.
        if self.linebuf:
            message = self.linebuf.encode("utf-8", "ignore").decode("utf-8")
            self.logger.log(self.log_level, message.rstrip())
            self.linebuf = ""
106
+
107
+
108
def disable_torch_init():
    """
    Disable the redundant torch default initialization to accelerate model creation.
    """
    import torch

    # Replace reset_parameters with a no-op on the layer types whose
    # default init is wasted work when weights are loaded right after.
    for layer_cls in (torch.nn.Linear, torch.nn.LayerNorm):
        layer_cls.reset_parameters = lambda self: None
116
+
117
+
118
def get_gpu_memory(max_gpus=None):
    """Get available memory for each GPU."""
    visible = torch.cuda.device_count()
    num_gpus = visible if max_gpus is None else min(max_gpus, visible)

    gpu_memory = []
    for gpu_id in range(num_gpus):
        with torch.cuda.device(gpu_id):
            props = torch.cuda.get_device_properties(torch.cuda.current_device())
            # Report free memory in GiB: total minus what torch has allocated.
            total_gib = props.total_memory / (1024 ** 3)
            allocated_gib = torch.cuda.memory_allocated() / (1024 ** 3)
            gpu_memory.append(total_gib - allocated_gib)
    return gpu_memory
136
+
137
+
138
def violates_moderation(text):
    """
    Check whether the text violates OpenAI moderation API.

    Best effort: any network failure or unexpected response shape is
    treated as "not flagged" rather than raised.

    :param text: the text to check (newlines are stripped before sending)
    :returns: True only when the moderation API flags the text
    """
    url = "https://api.openai.com/v1/moderations"
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"],
    }
    text = text.replace("\n", "")
    # Build the payload with json.dumps so quotes/backslashes/control chars
    # in `text` are escaped properly. The previous manual concatenation
    # ('{"input": "' + text + '"}') produced invalid JSON for any input
    # containing a double quote.
    data = json.dumps({"input": text}).encode("utf-8")
    try:
        ret = requests.post(url, headers=headers, data=data, timeout=5)
        flagged = ret.json()["results"][0]["flagged"]
    except requests.exceptions.RequestException:
        flagged = False
    except KeyError:
        flagged = False

    return flagged
159
+
160
+
161
# Flan-t5 trained with HF+FSDP saves corrupted weights for shared embeddings,
# Use this function to make sure it can be correctly loaded.
def clean_flant5_ckpt(ckpt_path):
    """Repair a sharded flan-t5 checkpoint in place.

    Copies the intact ``shared.weight`` tensor over the encoder/decoder
    ``embed_tokens.weight`` entries in whichever shard files hold them.

    :param ckpt_path: directory containing ``pytorch_model.bin.index.json``
        and the shard files it references.
    """
    index_file = os.path.join(ckpt_path, "pytorch_model.bin.index.json")
    # Context manager closes the index file promptly; the original
    # `json.load(open(...))` left the handle dangling until GC.
    with open(index_file, "r") as f:
        index_json = json.load(f)

    weightmap = index_json["weight_map"]

    share_weight_file = weightmap["shared.weight"]
    share_weight = torch.load(os.path.join(ckpt_path, share_weight_file))[
        "shared.weight"
    ]

    for weight_name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]:
        weight_file = weightmap[weight_name]
        weight = torch.load(os.path.join(ckpt_path, weight_file))
        weight[weight_name] = share_weight
        torch.save(weight, os.path.join(ckpt_path, weight_file))
179
+
180
+
181
def pretty_print_semaphore(semaphore):
    """Print a semaphore in better format."""
    if semaphore is None:
        return "None"
    value, locked = semaphore._value, semaphore.locked()
    return f"Semaphore(value={value}, locked={locked})"
186
+
187
+
188
+ """A javascript function to get url parameters for the gradio web server."""
189
+ get_window_url_params_js = """
190
+ function() {
191
+ const params = new URLSearchParams(window.location.search);
192
+ url_params = Object.fromEntries(params);
193
+ console.log("url_params", url_params);
194
+ return url_params;
195
+ }
196
+ """
197
+
198
+
199
def iter_over_async(
    async_gen: AsyncGenerator, event_loop: AbstractEventLoop
) -> Generator:
    """
    Convert async generator to sync generator

    :param async_gen: the AsyncGenerator to convert
    :param event_loop: the event loop to run on
    :returns: Sync generator
    """
    iterator = async_gen.__aiter__()

    async def fetch_one():
        # Pull one item; signal exhaustion with a (done, value) pair since
        # StopAsyncIteration cannot propagate through run_until_complete.
        try:
            return False, await iterator.__anext__()
        except StopAsyncIteration:
            return True, None

    while True:
        exhausted, item = event_loop.run_until_complete(fetch_one())
        if exhausted:
            return
        yield item
223
+
224
+
225
def detect_language(text: str) -> str:
    """Detect the language of a string, or "unknown" on failure."""
    import polyglot  # pip3 install polyglot pyicu pycld2
    from polyglot.detect import Detector
    from polyglot.detect.base import logger as polyglot_logger
    import pycld2

    # Silence polyglot's per-call warnings.
    polyglot_logger.setLevel("ERROR")

    try:
        return Detector(text).language.name
    except (pycld2.error, polyglot.detect.base.UnknownLanguage):
        return "unknown"
robohusky/video_transformers.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torchvision
2
+ import random
3
+ from PIL import Image, ImageOps
4
+ import numpy as np
5
+ import numbers
6
+ import math
7
+ import torch
8
+
9
class GroupRandomCrop(object):
    """Crop every frame in a group at one shared random location."""

    def __init__(self, size):
        # Accept either an int (square crop) or an explicit (h, w) pair.
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size

    def __call__(self, img_group):
        w, h = img_group[0].size
        th, tw = self.size

        # A single random offset shared by all frames keeps the clip aligned.
        x1 = random.randint(0, w - tw)
        y1 = random.randint(0, h - th)

        cropped = list()
        for img in img_group:
            assert (img.size[0] == w and img.size[1] == h)
            if w == tw and h == th:
                cropped.append(img)
            else:
                cropped.append(img.crop((x1, y1, x1 + tw, y1 + th)))

        return cropped
34
+
35
class MultiGroupRandomCrop(object):
    """Like GroupRandomCrop, but samples `groups` independent crop positions
    and returns the whole frame group cropped at each, concatenated."""

    def __init__(self, size, groups=1):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size
        self.groups = groups

    def __call__(self, img_group):
        w, h = img_group[0].size
        th, tw = self.size

        out_images = list()
        for _ in range(self.groups):
            # Fresh random offset per group; every frame shares it.
            x1 = random.randint(0, w - tw)
            y1 = random.randint(0, h - th)

            for img in img_group:
                assert (img.size[0] == w and img.size[1] == h)
                if w == tw and h == th:
                    out_images.append(img)
                else:
                    out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))

        return out_images
62
+
63
class GroupCenterCrop(object):
    """Apply torchvision's CenterCrop to every frame in the group."""

    def __init__(self, size):
        self.worker = torchvision.transforms.CenterCrop(size)

    def __call__(self, img_group):
        return list(map(self.worker, img_group))
69
+
70
class GroupRandomHorizontalFlip(object):
    """Randomly horizontally flips the given PIL.Image with a probability of 0.5
    """

    def __init__(self, is_flow=False):
        self.is_flow = is_flow

    def __call__(self, img_group, is_flow=False):
        # Single coin flip decides the fate of the whole group.
        if random.random() >= 0.5:
            return img_group

        flipped = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
        if self.is_flow:
            # invert flow pixel values when flipping (even indices —
            # presumably the x-flow channels; see GroupOverSample).
            for i in range(0, len(flipped), 2):
                flipped[i] = ImageOps.invert(flipped[i])
        return flipped
88
+
89
class GroupNormalize(object):
    """In-place per-channel normalize for a tensor whose dim 0 is a
    multiple of len(mean) (stacked frames)."""

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        # Tile mean/std so they cover every stacked channel along dim 0.
        rep_mean = self.mean * (tensor.size(0) // len(self.mean))
        rep_std = self.std * (tensor.size(0) // len(self.std))

        # TODO: make efficient
        for channel, m, s in zip(tensor, rep_mean, rep_std):
            channel.sub_(m).div_(s)

        return tensor
103
+
104
class GroupScale(object):
    """ Rescales the input PIL.Image to the given 'size'.
    'size' will be the size of the smaller edge.
    For example, if height > width, then image will be
    rescaled to (size * height / width, size)
    size: size of the smaller edge
    interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, interpolation=Image.BILINEAR):
        self.worker = torchvision.transforms.Resize(size, interpolation)

    def __call__(self, img_group):
        return list(map(self.worker, img_group))
118
+
119
class GroupOverSample(object):
    """Corner/center over-sampling of a frame group (10-crop when flip=True).

    For each of the five fixed offsets returned by
    GroupMultiScaleCrop.fill_fix_offset (4 corners + center), every frame is
    cropped and, when `flip` is enabled, its horizontal mirror is appended.
    """

    def __init__(self, crop_size, scale_size=None, flip=True):
        # Normalize an int crop size to a (w, h) pair.
        self.crop_size = crop_size if not isinstance(
            crop_size, int) else (crop_size, crop_size)

        if scale_size is not None:
            # Optionally rescale the whole group before cropping.
            self.scale_worker = GroupScale(scale_size)
        else:
            self.scale_worker = None
        self.flip = flip

    def __call__(self, img_group):

        if self.scale_worker is not None:
            img_group = self.scale_worker(img_group)

        image_w, image_h = img_group[0].size
        crop_w, crop_h = self.crop_size

        # more_fix_crop=False -> exactly five deterministic offsets.
        offsets = GroupMultiScaleCrop.fill_fix_offset(
            False, image_w, image_h, crop_w, crop_h)
        oversample_group = list()
        for o_w, o_h in offsets:
            normal_group = list()
            flip_group = list()
            for i, img in enumerate(img_group):
                crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
                normal_group.append(crop)
                flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)

                # 'L'-mode frames at even indices get their pixel values
                # inverted when mirrored — presumably x-flow channels;
                # mirrors GroupRandomHorizontalFlip's is_flow handling.
                if img.mode == 'L' and i % 2 == 0:
                    flip_group.append(ImageOps.invert(flip_crop))
                else:
                    flip_group.append(flip_crop)

            oversample_group.extend(normal_group)
            if self.flip:
                oversample_group.extend(flip_group)
        return oversample_group
158
+
159
class GroupFullResSample(object):
    """Deterministic 3-crop sampling (left / right / center along the width),
    optionally with horizontal mirrors, applied to every frame in the group."""

    def __init__(self, crop_size, scale_size=None, flip=True):
        # Normalize an int crop size to a (w, h) pair.
        self.crop_size = crop_size if not isinstance(
            crop_size, int) else (crop_size, crop_size)

        if scale_size is not None:
            self.scale_worker = GroupScale(scale_size)
        else:
            self.scale_worker = None
        self.flip = flip

    def __call__(self, img_group):

        if self.scale_worker is not None:
            img_group = self.scale_worker(img_group)

        image_w, image_h = img_group[0].size
        crop_w, crop_h = self.crop_size

        # Quarter-steps of the slack between image and crop size.
        w_step = (image_w - crop_w) // 4
        h_step = (image_h - crop_h) // 4

        offsets = list()
        offsets.append((0 * w_step, 2 * h_step))  # left
        offsets.append((4 * w_step, 2 * h_step))  # right
        offsets.append((2 * w_step, 2 * h_step))  # center

        oversample_group = list()
        for o_w, o_h in offsets:
            normal_group = list()
            flip_group = list()
            for i, img in enumerate(img_group):
                crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
                normal_group.append(crop)
                if self.flip:
                    flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)

                    # 'L'-mode frames at even indices get inverted when
                    # mirrored — presumably x-flow channels; mirrors
                    # GroupRandomHorizontalFlip's is_flow handling.
                    if img.mode == 'L' and i % 2 == 0:
                        flip_group.append(ImageOps.invert(flip_crop))
                    else:
                        flip_group.append(flip_crop)

            oversample_group.extend(normal_group)
            oversample_group.extend(flip_group)
        return oversample_group
204
+
205
class GroupMultiScaleCrop(object):
    """Multi-scale cropping shared across a frame group.

    Picks one (width, height) crop from `scales` fractions of the shorter
    image edge (aspect distortion bounded by `max_distort`), crops every
    frame at the same offset, then resizes the crops to `input_size`.
    """

    def __init__(self, input_size, scales=None, max_distort=1,
                 fix_crop=True, more_fix_crop=True):
        self.scales = scales if scales is not None else [1, .875, .75, .66]
        self.max_distort = max_distort
        # fix_crop: sample the offset from a fixed grid instead of uniformly.
        self.fix_crop = fix_crop
        # more_fix_crop: enlarge that grid from 5 to 13 candidate offsets.
        self.more_fix_crop = more_fix_crop
        self.input_size = input_size if not isinstance(input_size, int) else [
            input_size, input_size]
        self.interpolation = Image.BILINEAR

    def __call__(self, img_group):

        im_size = img_group[0].size

        # One crop window is sampled once and applied to every frame.
        crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size)
        crop_img_group = [
            img.crop(
                (offset_w,
                 offset_h,
                 offset_w +
                 crop_w,
                 offset_h +
                 crop_h)) for img in img_group]
        ret_img_group = [img.resize((self.input_size[0], self.input_size[1]), self.interpolation)
                         for img in crop_img_group]
        return ret_img_group

    def _sample_crop_size(self, im_size):
        # Randomly choose (crop_w, crop_h, offset_w, offset_h).
        image_w, image_h = im_size[0], im_size[1]

        # find a crop size
        base_size = min(image_w, image_h)
        crop_sizes = [int(base_size * x) for x in self.scales]
        # Snap candidate sizes within 3px of the target input size onto it,
        # so near-exact matches avoid a later resample.
        crop_h = [
            self.input_size[1] if abs(
                x - self.input_size[1]) < 3 else x for x in crop_sizes]
        crop_w = [
            self.input_size[0] if abs(
                x - self.input_size[0]) < 3 else x for x in crop_sizes]

        # Pair widths/heights whose scale indices differ by at most
        # max_distort, bounding the aspect-ratio distortion.
        pairs = []
        for i, h in enumerate(crop_h):
            for j, w in enumerate(crop_w):
                if abs(i - j) <= self.max_distort:
                    pairs.append((w, h))

        crop_pair = random.choice(pairs)
        if not self.fix_crop:
            # Fully random placement anywhere inside the image.
            w_offset = random.randint(0, image_w - crop_pair[0])
            h_offset = random.randint(0, image_h - crop_pair[1])
        else:
            w_offset, h_offset = self._sample_fix_offset(
                image_w, image_h, crop_pair[0], crop_pair[1])

        return crop_pair[0], crop_pair[1], w_offset, h_offset

    def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
        # Pick one offset uniformly from the fixed candidate grid.
        offsets = self.fill_fix_offset(
            self.more_fix_crop, image_w, image_h, crop_w, crop_h)
        return random.choice(offsets)

    @staticmethod
    def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h):
        # Enumerate candidate (w, h) crop offsets on a quarter-step grid
        # of the slack between image and crop size.
        w_step = (image_w - crop_w) // 4
        h_step = (image_h - crop_h) // 4

        ret = list()
        ret.append((0, 0))  # upper left
        ret.append((4 * w_step, 0))  # upper right
        ret.append((0, 4 * h_step))  # lower left
        ret.append((4 * w_step, 4 * h_step))  # lower right
        ret.append((2 * w_step, 2 * h_step))  # center

        if more_fix_crop:
            ret.append((0, 2 * h_step))  # center left
            ret.append((4 * w_step, 2 * h_step))  # center right
            ret.append((2 * w_step, 4 * h_step))  # lower center
            ret.append((2 * w_step, 0 * h_step))  # upper center

            ret.append((1 * w_step, 1 * h_step))  # upper left quarter
            ret.append((3 * w_step, 1 * h_step))  # upper right quarter
            ret.append((1 * w_step, 3 * h_step))  # lower left quarter
            ret.append((3 * w_step, 3 * h_step))  # lower right quarter

        return ret
292
+
293
class GroupRandomSizedCrop(object):
    """Random crop the given PIL.Image to a random size of (0.08 to 1.0) of the original size
    and a random aspect ratio of 3/4 to 4/3 of the original aspect ratio
    This is popularly used to train the Inception networks
    size: size of the smaller edge
    interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, interpolation=Image.BILINEAR):
        self.size = size
        self.interpolation = interpolation

    def __call__(self, img_group):
        # Try up to 10 random (area, aspect-ratio) proposals that fit inside
        # the first frame; all frames then share the chosen crop window.
        for attempt in range(10):
            area = img_group[0].size[0] * img_group[0].size[1]
            target_area = random.uniform(0.08, 1.0) * area
            aspect_ratio = random.uniform(3. / 4, 4. / 3)

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            # Randomly swap w/h to also cover the inverse aspect ratio.
            if random.random() < 0.5:
                w, h = h, w

            if w <= img_group[0].size[0] and h <= img_group[0].size[1]:
                x1 = random.randint(0, img_group[0].size[0] - w)
                y1 = random.randint(0, img_group[0].size[1] - h)
                found = True
                break
            else:
                found = False
                x1 = 0
                y1 = 0

        if found:
            out_group = list()
            for img in img_group:
                img = img.crop((x1, y1, x1 + w, y1 + h))
                assert (img.size == (w, h))
                out_group.append(
                    img.resize(
                        (self.size, self.size), self.interpolation))
            return out_group
        else:
            # Fallback: scale the short edge to `size`, then random-crop.
            scale = GroupScale(self.size, interpolation=self.interpolation)
            crop = GroupRandomCrop(self.size)
            return crop(scale(img_group))
341
+
342
class ConvertDataFormat(object):
    """Reshape a stacked (T*3, H, W) tensor into (3, T, H, W) for 3D models;
    input for 2D models passes through unchanged."""

    def __init__(self, model_type):
        self.model_type = model_type

    def __call__(self, images):
        if self.model_type == '2D':
            return images
        tc, h, w = images.size()
        frames = tc // 3
        # (T*3, H, W) -> (T, 3, H, W) -> (3, T, H, W)
        return images.view(frames, 3, h, w).permute(1, 0, 2, 3)
354
+
355
class Stack(object):
    """Stack a group of frames along the channel axis into one ndarray."""

    def __init__(self, roll=False):
        self.roll = roll

    def __call__(self, img_group):
        mode = img_group[0].mode
        if mode == 'L':
            # Greyscale: give each frame an explicit channel dim first.
            expanded = [np.expand_dims(frame, 2) for frame in img_group]
            return np.concatenate(expanded, axis=2)
        elif mode == 'RGB':
            if self.roll:
                # Reverse channel order (RGB -> BGR) per frame before stacking.
                rolled = [np.array(frame)[:, :, ::-1] for frame in img_group]
                return np.concatenate(rolled, axis=2)
            else:
                return np.concatenate(img_group, axis=2)
372
+
373
class ToTorchFormatTensor(object):
    """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255]
    to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """

    def __init__(self, div=True):
        self.div = div

    def __call__(self, pic):
        if isinstance(pic, np.ndarray):
            # numpy array: HWC -> CHW.
            img = torch.from_numpy(pic).permute(2, 0, 1).contiguous()
        else:
            # PIL Image: read the raw bytes, reshape to H x W x C,
            # then move channels first (this transpose dominates load time).
            img = torch.ByteTensor(
                torch.ByteStorage.from_buffer(
                    pic.tobytes()))
            img = img.view(pic.size[1], pic.size[0], len(pic.mode))
            img = img.transpose(0, 1).transpose(0, 2).contiguous()
        if self.div:
            return img.float().div(255)
        return img.float()
394
+
395
class IdentityTransform(object):
    """No-op transform: returns its input unchanged."""

    def __call__(self, data):
        return data
399
+
400
def get_index(num_frames, num_segments):
    """Uniformly sample `num_segments` frame indices from `num_frames` frames,
    taking roughly the middle frame of each equal-length segment."""
    seg_size = float(num_frames - 1) / num_segments
    start = int(seg_size / 2)
    return np.array(
        [start + int(np.round(seg_size * idx)) for idx in range(num_segments)]
    )