diff --git a/.gitattributes b/.gitattributes
index 4a9e052dec3f6cf4bf547aafe35fc8a03b2dc4a1..6ac8e9c365bba2e3b98f815e18395577c9879559 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -46,3 +46,4 @@ LAVT-RIS/refer/data/refcocog/instances.json filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/data/refcocog/refs(google).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/data/refcocog/refs(umd).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/evaluation/tokenizer/stanford-corenlp-3.4.1.jar filter=lfs diff=lfs merge=lfs -text
+hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b filter=lfs diff=lfs merge=lfs -text
diff --git a/.history/datasets/a2d_20250203174308.py b/.history/datasets/a2d_20250203174308.py
new file mode 100644
index 0000000000000000000000000000000000000000..b826e1310fccfc636bc415d94edbdb384042212f
--- /dev/null
+++ b/.history/datasets/a2d_20250203174308.py
@@ -0,0 +1,247 @@
+"""
+A2D-Sentences data loader
+modified from https://github.com/mttr2021/MTTR/blob/main/datasets/a2d_sentences/a2d_sentences_dataset.py
+"""
+from pathlib import Path
+
+import torch
+from torchvision.io import read_video
+import torchvision.transforms.functional as F
+
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+
+import h5py
+from pycocotools.mask import encode, area
+
+
+def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
+    image_id = f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
+    return image_id
+
+class A2DSentencesDataset(Dataset):
+    """
+    A Torch dataset for A2D-Sentences.
+    For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at:
+    https://arxiv.org/abs/1803.07485
+    """
+    def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int, subset):
+        super(A2DSentencesDataset, self).__init__()
+        dataset_path = str(image_folder)
+        self.mask_annotations_dir = os.path.join(dataset_path, 'text_annotations/a2d_annotation_with_instances')
+        self.videos_dir = os.path.join(dataset_path, 'Release/clips320H')
+        self.ann_file = ann_file
+        self.text_annotations = self.get_text_annotations()
+
+        self._transforms = transforms
+        self.return_masks = return_masks  # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        self.subset = subset
+
+        print(f'\n {subset} sample num: ', len(self.text_annotations))
+        print('\n')
+
+    def get_text_annotations(self):
+        with open(str(self.ann_file), 'r') as f:
+            text_annotations_by_frame = [tuple(a) for a in json.load(f)]
+        return text_annotations_by_frame
+
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2
+
+    def __len__(self):
+        return len(self.text_annotations)
+
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            text_query, video_id, frame_idx, instance_id = self.text_annotations[idx]
+
+            text_query = " ".join(text_query.lower().split())  # clean up the text query
+
+            # read the source window frames:
+            video_frames, _, _ = read_video(os.path.join(self.videos_dir, f'{video_id}.mp4'), pts_unit='sec')  # (T, H, W, C)
+            vid_len = len(video_frames)
+            # note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
+            frame_id = frame_idx - 1
+
+            if self.subset == 'train':
+                # get a window of window_size frames with frame frame_id in the middle.
+                num_frames = self.num_frames
+                # random sparse sample
+                sample_indx = [frame_id]
+                # local sample
+                sample_id_before = random.randint(1, 3)
+                sample_id_after = random.randint(1, 3)
+                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
+                sample_indx.extend(local_indx)
+
+                # global sampling
+                if num_frames > 3:
+                    all_inds = list(range(vid_len))
+                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
+                    global_n = num_frames - len(sample_indx)
+                    if len(global_inds) > global_n:
+                        select_id = random.sample(range(len(global_inds)), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(global_inds[s_id])
+                    elif vid_len >= global_n:  # sample long range global frames
+                        select_id = random.sample(range(vid_len), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                    else:
+                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+            elif self.subset == 'val':
+                start_idx, end_idx = frame_id - self.num_frames // 2, frame_id + (self.num_frames + 1) // 2
+                sample_indx = []
+                for i in range(start_idx, end_idx):
+                    i = min(max(i, 0), len(video_frames) - 1)  # pad out of range indices with edge frames
+                    sample_indx.append(i)
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+
+            # read frames
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for j in range(self.num_frames):
+                frame_indx = sample_indx[j]
+                img = F.to_pil_image(video_frames[frame_indx].permute(2, 0, 1))
+                imgs.append(img)
+
+            # read the instance mask
+            frame_annot_path = os.path.join(self.mask_annotations_dir, video_id, f'{frame_idx:05d}.h5')
+            f = h5py.File(frame_annot_path)
+            instances = list(f['instance'])
+            instance_idx = instances.index(instance_id)  # existence was already validated during init
+
+            instance_masks = np.array(f['reMask'])
+            if len(instances) == 1:
+                instance_masks = instance_masks[np.newaxis, ...]
+            instance_masks = torch.tensor(instance_masks).transpose(1, 2)
+            mask_rles = [encode(mask) for mask in instance_masks.numpy()]
+            mask_areas = area(mask_rles).astype(float)
+            f.close()
+
+            # select the referred mask
+            label = torch.tensor(0, dtype=torch.long)
+            mask = instance_masks[instance_idx].numpy()
+            if (mask > 0).any():
+                y1, y2, x1, x2 = self.bounding_box(mask)
+                box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                valid.append(1)
+            else:  # some frame didn't contain the instance
+                box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                valid.append(0)
+            mask = torch.from_numpy(mask)
+            labels.append(label)
+            boxes.append(box)
+            masks.append(mask)
+
+            # transform
+            h, w = instance_masks.shape[-2:]
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            # there is only one valid frame
+            target = {
+                'frames_idx': torch.tensor(sample_indx),  # [T,]
+                'valid_indices': torch.tensor([valid_indices]),
+                'labels': labels,  # [1,]
+                'boxes': boxes,  # [1, 4], xyxy
+                'masks': masks,  # [1, H, W]
+                'valid': torch.tensor(valid),  # [1,]
+                'caption': text_query,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)]),
+                'image_id': get_image_id(video_id, frame_idx, instance_id)
+            }
+
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at least one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+
+        return imgs, target
+
+
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+
+def build(image_set, args):
+    root = Path(args.a2d_path)
+    assert root.exists(), f'provided A2D-Sentences path {root} does not exist'
+    PATHS = {
+        "train": (root, root / "a2d_sentences_single_frame_train_annotations.json"),
+        "val": (root, root / "a2d_sentences_single_frame_test_annotations.json"),
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = A2DSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size),
+    #                               return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
+    dataset = A2DSentencesDataset(img_folder, ann_file, transforms=None,
+                                  return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
+    return dataset
\ No newline at end of file
diff --git a/.history/datasets/ytvos_ref_20250113131134.py
b/.history/datasets/ytvos_ref_20250113131134.py new file mode 100644 index 0000000000000000000000000000000000000000..49b8fede62b2b4e0fdfbaf3aa9f3902002a82acc --- /dev/null +++ b/.history/datasets/ytvos_ref_20250113131134.py @@ -0,0 +1,241 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + print(vid_meta) + + for exp_id, exp_dict in vid_data['expressions'].items(): + print(exp_dict) + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'bins': bins, + 'category': vid_meta['objects'][int(exp_dict['obj_id'])]['category'] + } + self.metas.append(meta) + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, 
rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_indx: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_indx), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def 
build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250113131327.py b/.history/datasets/ytvos_ref_20250113131327.py new file mode 100644 index 0000000000000000000000000000000000000000..a8bce1e2bef3f75fe70214de4116d034bb4c681f --- /dev/null +++ b/.history/datasets/ytvos_ref_20250113131327.py @@ -0,0 +1,241 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. 
+ + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + print(vid_meta) + + for exp_id, exp_dict in vid_data['expressions'].items(): + print(exp_dict) + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_indx: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, 
h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_indx), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250113141118.py b/.history/datasets/ytvos_ref_20250113141118.py new file mode 100644 index 0000000000000000000000000000000000000000..2ac73aa31f23d2f6777ff1252b84ca7080f02b50 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250113141118.py @@ -0,0 +1,241 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see 
https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + print(vid_meta) + print(vid_data) + + + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in 
sample_indx: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_indx), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250113162417.py 
b/.history/datasets/ytvos_ref_20250113162417.py new file mode 100644 index 0000000000000000000000000000000000000000..fb5327725f8bc17793c8e0f94cbb35d7c8d6d9c8 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250113162417.py @@ -0,0 +1,241 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for frame_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'frame_id' : frame_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': 
vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if 
image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250113163313.py b/.history/datasets/ytvos_ref_20250113163313.py new file mode 100644 index 0000000000000000000000000000000000000000..6e1449d07223d2b2aee04d711fafc424284dfa1b --- /dev/null +++ b/.history/datasets/ytvos_ref_20250113163313.py @@ -0,0 +1,248 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. 
+ + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for frame_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'frame_id' : frame_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + 
valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250114201904.py b/.history/datasets/ytvos_ref_20250114201904.py new file mode 100644 index 0000000000000000000000000000000000000000..8fe76d554794a61fb11e7c5cdb4e1d68592e32e2 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250114201904.py @@ -0,0 +1,252 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class 
YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + print(f"vid_data: {vid_data}") + print(f"vid_meta: {vid_meta}") + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for sample_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'sample_id' : sample_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def 
__getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), 
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250114201908.py b/.history/datasets/ytvos_ref_20250114201908.py new file mode 100644 index 0000000000000000000000000000000000000000..f566a830e7ae4a35219b6e3034787a803bf83ea7 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250114201908.py @@ -0,0 +1,253 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. 
+ + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + print(f"vid_data: {vid_data}") + print(f"vid_meta: {vid_meta}") + + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for sample_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'sample_id' : sample_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if 
(mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250114202340.py b/.history/datasets/ytvos_ref_20250114202340.py new file mode 100644 index 0000000000000000000000000000000000000000..2df05592439e77dd69a7854ae746dda6cab910c7 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250114202340.py @@ -0,0 +1,251 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json 
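The frame-selection strategy these snapshots converge on is: drop the first two and last two frames, split the remaining index range into four equal bins, and draw one random frame index per bin (the `vid_len < 11` skip is strict enough to keep every bin non-empty). A standalone sketch of that logic follows; the function name and default parameters are illustrative, not part of the original code.

import random

def sample_frames_from_bins(vid_len, num_bins=4, margin=2):
    """Illustrative restatement of the bin-based sampling in prepare_metas().

    Drops `margin` frames at each end, splits the remaining index range into
    `num_bins` equal bins, and draws one random index per bin.
    """
    start_idx, end_idx = margin, vid_len - margin
    bin_size = (end_idx - start_idx) // num_bins

    bins = []
    for i in range(num_bins):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
        bins.append((bin_start, bin_end))

    sample_indx = sorted(random.randint(lo, hi - 1) for lo, hi in bins)
    return bins, sample_indx

# e.g. a 40-frame video yields bins (2, 11), (11, 20), (20, 29), (29, 38)
bins, sample_indx = sample_frames_from_bins(40)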
+import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.vid_data, self.vid_meta = self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + return vid_meta, vid_data + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for sample_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'sample_id' : sample_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, 
-1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not 
exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250114205314.py b/.history/datasets/ytvos_ref_20250114205314.py new file mode 100644 index 0000000000000000000000000000000000000000..9174bb9ce61fccc70ee309f6b440c0efecfe639c --- /dev/null +++ b/.history/datasets/ytvos_ref_20250114205314.py @@ -0,0 +1,250 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. 
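Downstream, these snapshots construct the dataset through the build() helper defined at the bottom of each file. A minimal usage sketch under stated assumptions: the argument values are placeholders, only the attributes build() actually reads are set, and (as in these snapshots) transforms is None, so the returned images are raw uint8 tensors.

from types import SimpleNamespace

# Placeholder values; build() only reads ytvos_path, masks, num_frames, max_skip.
args = SimpleNamespace(
    ytvos_path='data/ref-youtube-vos',  # assumed dataset root
    masks=True,
    num_frames=5,
    max_skip=3,
)
dataset = build('train', args)          # the build() defined at the end of this file
imgs, target = dataset[0]
# imgs: [T, 3, H, W] tensor; target: dict with 'boxes', 'masks', 'valid', ...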
+ + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for sample_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'sample_id' : sample_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = 
torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250114211305.py b/.history/datasets/ytvos_ref_20250114211305.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f7cf0846cb2b617527d149fc2ce9a9c99a9430 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250114211305.py @@ -0,0 +1,252 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import 
ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + meta = { + 'video':vid, + 'sample_indx':sample_indx, + 'bins':bins + } + obj_id_cat = {} + for exp_id, exp_dict in vid_data['expressions'].items(): + obj_id = exp_dict['obj_id'] + print(obj_id, type(obj_id)) + print(vid_meta['objects'].keys()) + if obj_id not in obj_id_cat: + obj_id_cat[obj_id] = vid_meta[obj_id]['category'] + meta['obj_id_cat'] = obj_id_cat + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def 
__getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), 
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250116074326.py b/.history/datasets/ytvos_ref_20250116074326.py new file mode 100644 index 0000000000000000000000000000000000000000..a0df51a8617ac182d3b5b72628751c3858b1d463 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250116074326.py @@ -0,0 +1,239 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. 
+ + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + meta = { + 'video':vid, + 'sample_indx':sample_indx, + 'bins':bins, + 'frames':vid_frames + } + obj_id_cat = {} + for exp_id, exp_dict in vid_data['expressions'].items(): + obj_id = exp_dict['obj_id'] + if obj_id not in obj_id_cat: + obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category'] + meta['obj_id_cat'] = obj_id_cat + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + meta = self.metas[idx] # dict + + video, sample_indx, bins, frames, obj_id_cat = \ + meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat'] + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_indx: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + imgs.append(img) + + mask = Image.open(mask_path).convert('P') + mask = np.array(mask) + + # create the target + for obj_id in list(obj_id_cat.keys()): + obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary + if (obj_mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(obj_mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + obj_mask = torch.from_numpy(obj_mask) + + # append + masks.append(obj_mask) + boxes.append(box) + + 
+ # transform + w, h = img.size + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': sample_indx, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'obj_ids' : list(obj_id_cat.keys()), + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # # FIXME: handle "valid", since some box may be removed due to random crop + # if torch.any(target['valid'] == 1): # at leatst one instance + # instance_check = True + # else: + # idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/mbench/gpt_ref-ytvos-cy_20250121151513.py b/.history/mbench/gpt_ref-ytvos-cy_20250121151513.py new file mode 100644 index 0000000000000000000000000000000000000000..aebdedbbef16585f90f8dcfd2c21d26c32440d69 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos-cy_20250121151513.py @@ -0,0 +1,433 @@ +import sys +from os import path as osp +sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..'))) + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot 
as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 + +# Function to encode the image +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + + +# Captioner +ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' +] +def getCaption(video_id, json_data): + #데이터 가져오기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + + cat_names = set() + all_captions = dict() + for obj_id in list(video_data['annotations'][0].keys()): + cat_names.add(video_data['annotations'][0][obj_id]['category_name']) + + # cat_names : person, snowboard + # 1. gpt에서 직접 action의 대상이 될 수 있는가 물어보기 + # 2. ref-youtube-vos 에서 제공하는 카테고리 정보에서 우리가 처리하고 싶은 카테고리 이름만 남긴다 + + for cat_name in list(cat_names) : + image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names] + image_captions = {} + + captioner = OpenAI() + + #0단계: action의 대상이 될 수 있는가? + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + # response_check = captioner.chat.completions.create( + # model="gpt-4o", + # messages=[ + # { + # "role": "user", + # "content": f""" + # Can a {cat_name} be a subject of distinct actions or movements? + # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject. + # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions. + # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE. + # Answer only YES or NONE. + # """ + # } + # ], + # ) + # response_check_content = response_check.choices[0].message.content.strip().lower() + # print(f"Movable Check for {cat_name}: {response_check_content}") + + # if response_check_content == "yes": is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.") + continue + + for i in range(len(image_paths)): + image_path = image_paths[i] + frame_name = frame_names[i] + base64_image = encode_image(image_path) + + #1단계: 필터링 + print(cat_name, frame_name) + response1 = captioner.chat.completions.create( + model="gpt-4o", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions? + Focus only on clear and prominent actions, avoiding minor or ambiguous ones. + Each action should be unique and clearly associated with a specific object. + + Respond with YES if: + - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable. + - The {cat_name}s involve clear, distinguishable actions performed independently. 
+ + Respond with NONE if: + - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person. + - Actions are ambiguous, minor, or not clearly visible. + + If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE. + If the {cat_name} is 'person' and their actions are distinct and clear, output YES. + + Answer only YES or NONE.""" + + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}") + + #2단계: dense caption 만들기 + if should_caption: + response2 = captioner.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f""" + Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image. + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. 
+ Output only the caption.""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + print(f"{image_path} - {frame_name}: {caption}") + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = [] + valid_cat_names = list(all_captions.keys()) + for obj_id in list(video_data['annotations'][0].keys()): + cat = video_data['annotations'][0][obj_id]['category_name'] + if cat in valid_cat_names : valid_obj_ids.append(obj_id) + + return all_captions, valid_obj_ids + + +# Referring expression generator and QA filter +def getRefExp(video_id, frame_name, caption, obj_id, json_data): + # 이미지에 해당 물체 바운딩 박스 그리기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg')) + frame_indx = frame_names.index(frame_name) + obj_data = video_data['annotations'][frame_indx][obj_id] + + bbox = obj_data['bbox'] + cat_name = obj_data['category_name'] + valid = obj_data['valid'] + + if valid == 0: + print("Object not in this frame!") + return {} + + + x_min, y_min, x_max, y_max = bbox + x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max) + cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2) + plt.figure() + plt.imshow(I) + plt.axis('off') + plt.show() + + #cropped object for visibility check + cropped_I = I[y_min:y_max, x_min:x_max] + pil_cropped_I = Image.fromarray(cropped_I) + buff_crop = BytesIO() + pil_cropped_I.save(buff_crop, format='JPEG') + base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8") + + #entire image for referring expression generation + pil_I = Image.fromarray(I) + buff = BytesIO() + pil_I.save(buff, format='JPEG') + base64_I = base64.b64encode(buff.getvalue()).decode("utf-8") + + # 구분 가능 여부 확인 + generator = OpenAI() + response_check = generator.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + + "type": "text", + "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}? + Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible. + + Guidelines: + - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES. + - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE. + - If the object is clearly visible and identifiable as a {cat_name}, respond with YES. + + Output only either YES or NONE. 
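All of the GPT calls in these scripts follow the same round trip: base64-encode a frame, attach it as an image_url content part next to a text prompt, and reduce the reply to a yes/no decision. A trimmed sketch of that pattern, assuming OPENAI_API_KEY is set in the environment; the file path and question in the usage comment are placeholders.

import base64
from openai import OpenAI

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def ask_yes_no(image_path, question, model="gpt-4o"):
    # Sends one image plus a text question and maps the reply to True/False,
    # mirroring the filtering and QA steps used in this script.
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    base64_image = encode_image(image_path)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            }
        ],
    )
    return "yes" in response.choices[0].message.content.strip().lower()

# e.g. visible = ask_yes_no("frames/00000.jpg",
#                           "Is the highlighted person clearly visible? Answer YES or NONE.")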
+ """ + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"}, + } + ] + }, + ] + ) + + response_check_content = response_check.choices[0].message.content.strip().lower() + print(f"is object {obj_id} visible: {response_check_content}") + + if "yes" not in response_check_content: + print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.") + return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False} + + # Referring expression 만들기 + # generator = OpenAI() + response = generator.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}. + Guidelines for creating the referring expression: + 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}). + 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s. + 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}. + 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}. + 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities. + 6. Use '{cat_name}' as the noun for the referring expressions. + Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}). + + {caption} + """ + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + # { + # "type": "image_url", + # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"}, + # } + ], + } + ], + ) + + ref_exp = response.choices[0].message.content.strip() + + #QA filtering + #QA1: 원하는 물체를 설명하는지 + filter = OpenAI() + response1 = filter.chat.completions.create( + model="gpt-4o", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO. + {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response1_content = response1.choices[0].message.content + describesHighlighted = True if "yes" in response1_content.lower() else False + + #QA2: 원하지 않는 물체를 설명하지 않는지 + response2 = filter.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO. 
+ {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response2_content = response2.choices[0].message.content + describesNotHighlighted = True if "yes" in response2_content.lower() else False + + isValid = True if describesHighlighted and not describesNotHighlighted else False + + print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}") + + return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid} + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # # 전체 데이터셋 + # train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # # 전체 데이터셋 메타데이터 + # metas = train_dataset.metas + + with open('mbench/sampled_frame3.json', 'r') as file: + data = json.load(file) + + vid_ids = list(data.keys()) + + all_ref_exps = {} + + #==================GPT 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + # 전체 데이터셋의 vid_id에 대해 + for i in range(1): + vid_id = vid_ids[i] + + #====캡션 만들기==== + caption, valid_obj_ids = getCaption(vid_id, data) + cats_in_vid = list(caption.keys()) + + #====referring expression 만들고 QA filtering==== + ref_expressions = {} + # 각 카테고리별로 + for cat_name in cats_in_vid: + if cat_name not in ref_expressions: + ref_expressions[cat_name] = {} + + # 각 비디오 프레임 별로 + for frame_name in data[vid_id]['frame_names']: + + if frame_name not in ref_expressions[cat_name]: + ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary + + caption = caption[cat_name][frame_name] + + if not caption : continue + else : + # 각 obj id별로 + for obj_id in valid_obj_ids: + ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data) + ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp + + + all_ref_exps[vid_id] = ref_expressions + + with open('mbench/result-cy.json', 'w') as file: + json.dump(all_ref_exps, file) diff --git a/.history/mbench/gpt_ref-ytvos-revised_20250121160858.py b/.history/mbench/gpt_ref-ytvos-revised_20250121160858.py new file mode 100644 index 0000000000000000000000000000000000000000..3f0ae7a62d1ba1c8a439fef1486935d8eb184a76 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos-revised_20250121160858.py @@ -0,0 +1,428 @@ +import sys +from os import path as osp +sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..'))) + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import 
OpenAI +import base64 + +# Function to encode the image +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + +# Captioner +ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' +] +def getCaption(video_id, json_data): + #데이터 가져오기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + + cat_names = set() + all_captions = dict() + for obj_id in list(video_data['annotations'][0].keys()): + cat_names.add(video_data['annotations'][0][obj_id]['category_name']) + + # cat_names : person, snowboard + # 1. gpt에서 직접 action의 대상이 될 수 있는가 물어보기 + # 2. ref-youtube-vos 에서 제공하는 카테고리 정보에서 우리가 처리하고 싶은 카테고리 이름만 남긴다 + + for cat_name in list(cat_names) : + image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names] + image_captions = {} + + captioner = OpenAI() + + #0단계: action의 대상이 될 수 있는가? + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + # response_check = captioner.chat.completions.create( + # model="gpt-4o", + # messages=[ + # { + # "role": "user", + # "content": f""" + # Can a {cat_name} be a subject of distinct actions or movements? + # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject. + # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions. + # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE. + # Answer only YES or NONE. + # """ + # } + # ], + # ) + # response_check_content = response_check.choices[0].message.content.strip().lower() + # print(f"Movable Check for {cat_name}: {response_check_content}") + + # if response_check_content == "yes": is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.") + continue + + for i in range(len(image_paths)): + image_path = image_paths[i] + frame_name = frame_names[i] + base64_image = encode_image(image_path) + + #1단계: 필터링 + #print(f"-----------category name: {cat_name}, frame name: {frame_name}") + response1 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions? + Focus only on clear and prominent actions, avoiding minor or ambiguous ones. + Each action should be unique and clearly associated with a specific object. + + Respond with YES if: + - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable. + - The {cat_name}s involve clear, distinguishable actions performed independently. + + Respond with NONE if: + - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person. + - Actions are ambiguous, minor, or not clearly visible. 
+ + If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE. + If the {cat_name} is 'person' and their actions are distinct and clear, output YES. + + Answer only YES or NONE.""" + + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + #print(f"are {cat_name}s distinguished by action: {response_content}") + + #2단계: dense caption 만들기 + if should_caption: + response2 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f""" + Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image. + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. 
+ Output only the caption.""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = [] + valid_cat_names = list(all_captions.keys()) + for obj_id in list(video_data['annotations'][0].keys()): + cat = video_data['annotations'][0][obj_id]['category_name'] + if cat in valid_cat_names : valid_obj_ids.append(obj_id) + + return all_captions, valid_obj_ids + +# Referring expression generator and QA filter +def getRefExp(video_id, frame_name, caption, obj_id, json_data): + + # 이미지에 해당 물체 바운딩 박스 그리기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg')) + frame_indx = frame_names.index(frame_name) + obj_data = video_data['annotations'][frame_indx][obj_id] + + bbox = obj_data['bbox'] + cat_name = obj_data['category_name'] + valid = obj_data['valid'] + + if valid == 0: + print("Object not in this frame!") + return {} + + + x_min, y_min, x_max, y_max = bbox + x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max) + cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2) + plt.figure() + plt.imshow(I) + plt.axis('off') + plt.show() + + #cropped object for visibility check + cropped_I = I[y_min:y_max, x_min:x_max] + pil_cropped_I = Image.fromarray(cropped_I) + buff_crop = BytesIO() + pil_cropped_I.save(buff_crop, format='JPEG') + base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8") + + #entire image for referring expression generation + pil_I = Image.fromarray(I) + buff = BytesIO() + pil_I.save(buff, format='JPEG') + base64_I = base64.b64encode(buff.getvalue()).decode("utf-8") + + # 구분 가능 여부 확인 + generator = OpenAI() + response_check = generator.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + + "type": "text", + "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}? + Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible. + + Guidelines: + - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES. + - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE. + - If the object is clearly visible and identifiable as a {cat_name}, respond with YES. + + Output only either YES or NONE. 
+ """ + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"}, + } + ] + }, + ] + ) + + response_check_content = response_check.choices[0].message.content.strip().lower() + #print(f"is object {obj_id} visible: {response_check_content}") + + if "yes" not in response_check_content: + print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.") + return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False} + + # Referring expression 만들기 + # generator = OpenAI() + response = generator.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}. + Guidelines for creating the referring expression: + 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}). + 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s. + 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}. + 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}. + 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities. + 6. Use '{cat_name}' as the noun for the referring expressions. + Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}). + + {caption} + """ + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + # { + # "type": "image_url", + # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"}, + # } + ], + } + ], + ) + + ref_exp = response.choices[0].message.content.strip() + + #QA filtering + #QA1: 원하는 물체를 설명하는지 + filter = OpenAI() + response1 = filter.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO. + {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response1_content = response1.choices[0].message.content + describesHighlighted = True if "yes" in response1_content.lower() else False + + #QA2: 원하지 않는 물체를 설명하지 않는지 + response2 = filter.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO. 
+ {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response2_content = response2.choices[0].message.content + notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True + + isValid = True if describesHighlighted and notDescribesNotHighlighted else False + + #print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}") + #print(f"ref exp: {ref_exp}") + #print("") + + return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid} + + +if __name__ == '__main__': + with open('mbench/sampled_frame3.json', 'r') as file: + data = json.load(file) + + vid_ids = list(data.keys()) + all_ref_exps = {} + + os.environ['OPENAI_API_KEY'] = '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' + + # 전체 데이터셋의 vid_id에 대해 + for i in range(50): + vid_id = vid_ids[i] + + #====캡션 만들기==== + # print("=====================captioner========================") + captions, valid_obj_ids = getCaption(vid_id, data) + cats_in_vid = list(captions.keys()) + # print() + + #====referring expression 만들고 QA filtering==== + # print("=====================referring expression generator & QA filter========================") + ref_expressions = {} + + # 각 카테고리별로 + for cat_name in cats_in_vid: + if cat_name not in ref_expressions: + ref_expressions[cat_name] = {} + # 각 비디오 프레임 별로 + for frame_name in data[vid_id]['frame_names']: + # print(f'--------category: {cat_name}, frame_name: {frame_name}') + + if frame_name not in ref_expressions[cat_name]: + ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary + caption = captions[cat_name][frame_name] + if not caption : continue + else : + # 각 obj id별로 + for obj_id in valid_obj_ids: + ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data) + ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp + + all_ref_exps[vid_id] = ref_expressions + + + with open('mbench/result_revised50.json', 'w') as file: + json.dump(all_ref_exps, file, indent=4) + + + + + diff --git a/.history/mbench/gpt_ref-ytvos_20250119070820.py b/.history/mbench/gpt_ref-ytvos_20250119070820.py new file mode 100644 index 0000000000000000000000000000000000000000..621627209495dc3ef794fc1e1c086f29f21f0c56 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_20250119070820.py @@ -0,0 +1,286 @@ +from datasets import build_dataset +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 + +os.environ['OPENAI_API_KEY'] = 
'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' + +# Function to encode the image +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + +def getCaption(video_id, json_data): + #데이터 가져오기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + + cat_names = set() + for obj_id in list(video_data['annotations'][0].keys()): + cat_names.add(video_data['annotations'][0][obj_id]['category_name']) + + if len(cat_names) == 1: + cat_name = next(iter(cat_names)) + else: + print("more than 2 categories") + return -1 + + image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names] + image_captions = {} + + captioner = OpenAI() + for i in range(len(image_paths)): + image_path = image_paths[i] + frame_name = frame_names[i] + base64_image = encode_image(image_path) + + #1단계: 필터링 + response1 = captioner.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + + #2단계: dense caption 만들기 + if should_caption: + response2 = captioner.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f""" + Describe the image in detail focusing on the {cat_name}s' actions. + 1. Each action should be prominent, clear and unique, describing the corresponding object only. + 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’. + 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting. + 4. 
Do not include actions that needs to be guessed or suggested.""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + else: + caption = None + + image_captions[frame_name] = caption + return image_captions + +def getRefExp(video_id, frame_name, caption, obj_id, json_data): + # 이미지에 해당 물체 바운딩 박스 그리기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg')) + frame_indx = frame_names.index(frame_name) + obj_data = video_data['annotations'][frame_indx][obj_id] + + bbox = obj_data['bbox'] + cat_name = obj_data['category_name'] + valid = obj_data['valid'] + + if valid == 0: + print("Object not in this frame!") + return {} + + + x_min, y_min, x_max, y_max = bbox + x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max) + cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2) + plt.figure() + plt.imshow(I) + plt.axis('off') + plt.show() + pil_I = Image.fromarray(I) + buff = BytesIO() + pil_I.save(buff, format='JPEG') + base64_I = base64.b64encode(buff.getvalue()).decode("utf-8") + + #ref expression 만들기 + generator = OpenAI() + response = generator.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box. + 1. The referring expression describes the action and does not contain information about appearance or location in the picture. + 2. Focus only on prominent actions and avoid overly detailed or indeterminate details. + 3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words. + 4. The referring expression should only describe the highlighted {cat_name} and not any other. + 5. Use '{cat_name}' as the noun for the referring expressions. + Output only the referring expression. + {caption}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + ref_exp = response.choices[0].message.content + + #QA filtering + #QA1: 원하는 물체를 설명하는지 + filter = OpenAI() + response1 = filter.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO. + {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response1_content = response1.choices[0].message.content + describesHighlighted = True if "yes" in response1_content.lower() else False + + #QA2: 원하지 않는 물체를 설명하지 않는지 + response2 = filter.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO. 
+ {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response2_content = response2.choices[0].message.content + describesNotHighlighted = True if "yes" in response2_content.lower() else False + + isValid = True if describesHighlighted and not describesNotHighlighted else False + + print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}") + + return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid} + +def createRefExp(video_id, json_data): + video_data = json_data[video_id] + obj_ids = list(video_data['annotations'][0].keys()) + frame_names = video_data['frame_names'] + + captions_per_frame = getCaption(video_id, json_data) + + if captions_per_frame == -1: + print("There are more than 2 cateories") + return + + + video_ref_exps = {} + + for frame_name in frame_names: + frame_caption = captions_per_frame[frame_name] + + if frame_caption == None: + video_ref_exps[frame_name] = None + + else: + frame_ref_exps = {} + for obj_id in obj_ids: + exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data) + frame_ref_exps[obj_id] = exp_per_obj + video_ref_exps[frame_name] = frame_ref_exps + + return video_ref_exps + +if __name__ == '__main__': + with open('mbench/sampled_frame3.json', 'r') as file: + data = json.load(file) + + videos = set() + with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file: + manual_select = list(file) + for frame in manual_select: + result = json.loads(frame) + videos.add(result['video']) + videos = list(videos) + + + all_video_refs = {} + for i in range(10): + video_id = videos[i] + video_ref = createRefExp(video_id, data) + all_video_refs[video_id] = video_ref \ No newline at end of file diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183936.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183936.py new file mode 100644 index 0000000000000000000000000000000000000000..de6149e9bcaafadd04aea9b75a7a3aaf171393ee --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183936.py @@ -0,0 +1,199 @@ +import os +import sys +from os import path as osp +from io import BytesIO + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + 
frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, 
contoured_frames + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190533.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190533.py new file mode 100644 index 0000000000000000000000000000000000000000..957a573b4639bcd04b47456a28cb173c6b978650 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190533.py @@ -0,0 +1,429 @@ +import os + +import sys +from os import path as osp +from io import BytesIO + +from ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w 
// 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, contoured_frames + + +def getCaption(idx, color_mask=True): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. 
{"person", "elephant", ...} + all_captions = dict() + + base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask) + marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. + + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. + 3) Each action is unambiguously recognizable and distinct. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. + 2) They show no noticeable action beyond standing or minor movements. + + Answer strictly with either "YES" or "NONE". + """ + + + response1 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Therefore, your expressions for these {cat_name}s should describe unique action of each object. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. 
Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + 10. Do not include descriptions of appearance such as clothes, color, size, shape etc. + 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous. + 12. Do not mention object IDs. + 13. Use '{cat_name}' as the noun for the referring expressions. + + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. + """ + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Please describe each {cat_name} using **clearly observable** and **specific** actions. + + ## Guidelines: + 1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object). + 2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw). + 3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”). + 4. Do not use vague expressions like "interacting with something"** or "engaging with another object." + Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button"). + 5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions. + 7. Base your description on the following action definitions: + - Facial with object manipulation + - General body movement, body position or pattern + - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object"). + - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone"). + + ## Output Format: + - For each labeled {cat_name}, output one line in the format: + ID. action-oriented description + + Example: + 1. a bear grasping the edge of a wood with its front paws + 2. the bear pushing another bear, leaning forward + + **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”). + **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). 
+ For each labeled {cat_name}, output referring expressions for each object id. + """ + if should_caption: + response2 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": dense_caption_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return vid_id, all_captions, valid_obj_ids + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json") + + args = parser.parse_args() + + print(args.save_caption_path, flush=True) + print(args.save_valid_obj_ids_path, flush=True) + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(370): + vid_id, all_captions, valid_obj_ids = getCaption(i, True) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190813.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190813.py new file mode 100644 index 0000000000000000000000000000000000000000..0b07f482fc6ac58f78b690db64f24454930fef25 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190813.py @@ -0,0 +1,427 @@ +import os 
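+# Overview: number_objects_and_encode() draws each object's mask boundary (or a translucent
+# color mask) and writes its numeric ID at the mask centroid; getCaption() then asks GPT
+# whether the labeled objects of a category perform clearly distinct actions and, only if so,
+# requests one action-centric description per numbered object.
+# Output files (shapes as populated in __main__ below):
+#   --save_caption_path       : {vid_id: {cat_name: {frame_name: caption or None}}}
+#   --save_valid_obj_ids_path : {vid_id: {cat_name: [obj_id, ...]}}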
+import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from os import path as osp +from io import BytesIO + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: 
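+                            # zero-area contour (m00 == 0): fall back to (0, 0) as the anchor for the ID label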
+ center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, contoured_frames + + +def getCaption(idx, color_mask=True): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...} + all_captions = dict() + + base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask) + marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. 
+ + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. + 3) Each action is unambiguously recognizable and distinct. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. + 2) They show no noticeable action beyond standing or minor movements. + + Answer strictly with either "YES" or "NONE". + """ + + + response1 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Therefore, your expressions for these {cat_name}s should describe unique action of each object. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + 10. Do not include descriptions of appearance such as clothes, color, size, shape etc. + 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous. + 12. Do not mention object IDs. + 13. Use '{cat_name}' as the noun for the referring expressions. + + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. + """ + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Please describe each {cat_name} using **clearly observable** and **specific** actions. + + ## Guidelines: + 1. 
Focus on visible, prominent actions only (e.g., running, pushing, grasping an object). + 2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw). + 3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”). + 4. Do not use vague expressions like "interacting with something"** or "engaging with another object." + Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button"). + 5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions. + 7. Base your description on the following action definitions: + - Facial with object manipulation + - General body movement, body position or pattern + - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object"). + - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone"). + + ## Output Format: + - For each labeled {cat_name}, output one line in the format: + ID. action-oriented description + + Example: + 1. a bear grasping the edge of a wood with its front paws + 2. the bear pushing another bear, leaning forward + + **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”). + **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled {cat_name}, output referring expressions for each object id. 
+ """ + if should_caption: + response2 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": dense_caption_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return vid_id, all_captions, valid_obj_ids + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json") + + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(370): + vid_id, all_captions, valid_obj_ids = getCaption(i, True) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130220417.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130220417.py new file mode 100644 index 0000000000000000000000000000000000000000..7edcef6aa7554657892aff2516273e8bd84a7da1 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130220417.py @@ -0,0 +1,427 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from os import path as osp +from io import BytesIO + +from mbench.ytvos_ref import 
build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = 
center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, contoured_frames + + +def getCaption(idx, color_mask=True): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...} + all_captions = dict() + + base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask) + marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. + + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. 
+ 3) Each action is unambiguously recognizable and distinct. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. + 2) They show no noticeable action beyond standing or minor movements. + + Answer strictly with either "YES" or "NONE". + """ + + + response1 = captioner.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Therefore, your expressions for these {cat_name}s should describe unique action of each object. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + 10. Do not include descriptions of appearance such as clothes, color, size, shape etc. + 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous. + 12. Do not mention object IDs. + 13. Use '{cat_name}' as the noun for the referring expressions. + + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. + """ + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Please describe each {cat_name} using **clearly observable** and **specific** actions. + + ## Guidelines: + 1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object). + 2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw). + 3. 
Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”). + 4. Do not use vague expressions like "interacting with something"** or "engaging with another object." + Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button"). + 5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions. + 7. Base your description on the following action definitions: + - Facial with object manipulation + - General body movement, body position or pattern + - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object"). + - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone"). + + ## Output Format: + - For each labeled {cat_name}, output one line in the format: + ID. action-oriented description + + Example: + 1. a bear grasping the edge of a wood with its front paws + 2. the bear pushing another bear, leaning forward + + **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”). + **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled {cat_name}, output referring expressions for each object id. + """ + if should_caption: + response2 = captioner.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": dense_caption_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return vid_id, all_captions, valid_obj_ids + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json") + + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 
'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(370): + vid_id, all_captions, valid_obj_ids = getCaption(i, True) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140559.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140559.py new file mode 100644 index 0000000000000000000000000000000000000000..48dc049fb725cde5fd97d6e89935ecf0286ba0d2 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140559.py @@ -0,0 +1,461 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import time + +from os import path as osp +from io import BytesIO + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = 
cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, contoured_frames + + +def getCaption(idx, model='gpt-4o', color_mask=True): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. 
{"person", "elephant", ...} + all_captions = dict() + + base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask) + #marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. + + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.) + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. + 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. + 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance. + 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion + + Answer strictly with either "YES" or "NONE". + """ + + response1 = captioner.chat.completions.create( + # model="chatgpt-4o-latest", + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object. 
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Therefore, your expressions for these {cat_name}s should describe unique action of each object. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + 10. Do not include descriptions of appearance such as clothes, color, size, shape etc. + 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous. + 12. Do not mention object IDs. + 13. Use '{cat_name}' as the noun for the referring expressions. + + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. + """ + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + + I want to use your expressions to create an **action-centric referring expression** dataset. + Please describe each {cat_name} using **clearly observable** and **specific** actions. + + --- + ## Guidelines: + 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head"). + 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”). + 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions. + 7. Base your description on these action definitions: + - Avoid using term 'minimal' or 'slightly'. + - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back") + - details such as motion and intention, facial with object manipulation + - movements with objects or other entities when they are prominent and observable. expression should be specific. 
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X)) + --- + + ## Output Format: + - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format : + object id. using {cat_name} as subject noun, action-oriented description + (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.) + - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal). + + ### Example + If the frame has 2 labeled bears, your output should look like: + 1. the bear reaching his right arm while leaning forward to capture the prey + 2. a bear standing upright facing right, touching the bike aside + + --- + **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”). + **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed). + **Do not include markdown** in the output. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled {cat_name}, output referring expressions for each object id. + """ + MAX_RETRIES = 2 + retry_count = 0 + + if should_caption: + while retry_count < MAX_RETRIES: + + response2 = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": dense_caption_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + # caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + + caption = response2.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. 
Caption generation failed.") + + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return all_captions, valid_obj_ids + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json") + + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(370): + vid_id, all_captions, valid_obj_ids = getCaption(i, True) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141240.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141240.py new file mode 100644 index 0000000000000000000000000000000000000000..ac10a64448640a89e3d7c035abaf10fcf5d68b7f --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141240.py @@ -0,0 +1,460 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import time + +from os import path as osp +from io import BytesIO + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure 
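+ # A minimal, hedged sketch of building the OpenAI client from the environment instead of
+ # hardcoding the key the way the __main__ block at the bottom of this script does. It assumes
+ # OPENAI_API_KEY is already exported in the shell; OpenAI() with no arguments reads the same
+ # variable. The helper name is illustrative only and is not referenced elsewhere in this file.
+ def make_captioner_from_env():
+     from openai import OpenAI  # imported again here only so the sketch stands alone
+     return OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))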
# (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, 
(text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, contoured_frames + + +def getCaption(idx, model='gpt-4o', color_mask=True): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...} + all_captions = dict() + + base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask) + #marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. + + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.) + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. + 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. 
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance. + 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion + + Answer strictly with either "YES" or "NONE". + """ + + response1 = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Therefore, your expressions for these {cat_name}s should describe unique action of each object. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + 10. Do not include descriptions of appearance such as clothes, color, size, shape etc. + 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous. + 12. Do not mention object IDs. + 13. Use '{cat_name}' as the noun for the referring expressions. + + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. + """ + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + + I want to use your expressions to create an **action-centric referring expression** dataset. + Please describe each {cat_name} using **clearly observable** and **specific** actions. + + --- + ## Guidelines: + 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head"). + 3. 
**Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”). + 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions. + 7. Base your description on these action definitions: + - Avoid using term 'minimal' or 'slightly'. + - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back") + - details such as motion and intention, facial with object manipulation + - movements with objects or other entities when they are prominent and observable. expression should be specific. + (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X)) + --- + + ## Output Format: + - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format : + object id. using {cat_name} as subject noun, action-oriented description + (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.) + - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal). + + ### Example + If the frame has 2 labeled bears, your output should look like: + 1. the bear reaching his right arm while leaning forward to capture the prey + 2. a bear standing upright facing right, touching the bike aside + + --- + **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”). + **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed). + **Do not include markdown** in the output. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled {cat_name}, output referring expressions for each object id. + """ + MAX_RETRIES = 2 + retry_count = 0 + + if should_caption: + while retry_count < MAX_RETRIES: + + response2 = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": dense_caption_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + # caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + + caption = response2.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. 
Caption generation failed.") + + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return all_captions, valid_obj_ids + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json") + + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(370): + vid_id, all_captions, valid_obj_ids = getCaption(i, True) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172754.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172754.py new file mode 100644 index 0000000000000000000000000000000000000000..c5efab129d003d0163b5c6bd9a01eb4d3942a054 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172754.py @@ -0,0 +1,656 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import time + +from os import path as osp +from io import BytesIO +import random + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import 
functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json +import requests +from openai.error import APIConnectionError, OpenAIError + +def number_objects_and_encode_old(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + moments = cv2.moments(contour) + if moments["m00"] != 0: + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 + text_y = center_y + + rect_start = (text_x - 5, text_y - text_size[1] - 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # 
plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, contoured_frames, vid_cat_cnts + + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + font_scale = 1.2 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 + text_y = center_y + + rect_start = (text_x - 5, text_y - text_size[1] - 5) + rect_end = (text_x + text_size[0] + 5, text_y + 3) + + contour_thickness = 1 + rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness) + rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness) + + cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness) + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + + if color_mask: + alpha = 0.08 + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) 
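+ # The labeled frame above was JPEG-encoded into the in-memory buffer and base64-encoded so that
+ # getCaption() can send it to the chat completions API as a "data:image/jpeg;base64,..." image_url.
+ # The same BytesIO buffer is rewound and truncated below and reused for the plain contour image,
+ # so each frame needs only one buffer allocation.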
# Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, contoured_frames, vid_cat_cnts + + + +def getCaption(idx, model='gpt-4o'): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...} + all_captions = dict() + + # color_mask = random.choice([True, False]) + color_mask = random.choices([False, True], weights=[60, 40])[0] + + base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask) + #marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + # cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + # cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. + + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.) + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. + 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. + 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance. + 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion + + Answer strictly with either "YES" or "NONE". 
+ """ + + response1 = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f""" + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}. + + Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'. + - expressions like 'seems to be', 'appears to be' are BANNED! + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc. + 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous. + 11. Do not mention object IDs. + 12. Use '{cat_name}' as the noun for the referring expressions. + + Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + + - Your answer should contain details, and follow the following format: + object id. action-oriented description + (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right. + 2. a person bending over and touching his boots to tie the shoelace.) + - for action-oriented description, use {cat_name} as subject noun + + **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal). + Please pay attention to the categories of these objects and don’t change them. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. Please start your answer:""" + + + dense_caption_prompt_2 = f""" + You are an advanced visual language model analyzing a video frame. + In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary. 
+ + Your task is to generate **action-oriented descriptions** for each labeled {cat_name}. + Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors. + + --- + ## Key Guidelines: + 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing. + - Example: "grabbing a branch and pulling it down" (**(O) Specific**) + - Avoid: "moving slightly to the side" (**(X) Too vague**) + + 2. **Do not describe appearance, color, or position**—focus purely on the action. + - (X) "A large brown bear standing on the left" + - (O) "The bear is lifting its front paws and swiping forward." + + 3. **Use dynamic, action-specific verbs** rather than passive descriptions. + - (O) "The giraffe is tilting its head and sniffing the ground." + - (X) "The giraffe is near a tree and looking around." + + 4. **Avoid assumptions, emotions, or speculative phrasing.** + - (X) "The person seems excited" / "The person might be preparing to jump." + - (O) "The person is pushing its front legs against the rock and leaping forward." + + 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'. + - expressions like 'seems to be', 'appears to be' are BANNED! + 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + + 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**. + - **Each object should have a unique, descriptive action.** + - (X) "Two dogs are running." + - (O) "1. One dog is chasing another, its legs stretched mid-air. + 2. The other dog is looking back while speeding up." + + --- + ## Output Format: + - Each labeled **{cat_name}** should have exactly **one line of description**. + - Format: `ID. {cat_name} + action-based description` + - (O) Example: + ``` + 1. The person is leaning forward while opening a bag with both hands. + 2. The person is holding onto a rope and pulling themselves up. + ``` + - **Ensure that each object is described individually.** + - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed). + + --- + ## Additional Instructions: + - **Do NOT** use expressions like "it appears that..." or "it seems like...". + - **Do NOT** mention object IDs in the description (only use the provided format). + - **DO NOT** include markdown formatting (no bullet points, no asterisks). + - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories. + + Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer: + """ + + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary. + + I am building an **action-centric referring expression** dataset. + Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**. + + --- + ## Guidelines: + 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit"). + 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump"). + 4. 
**Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction. + 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**. + 7. Base your descriptions on these principles: + - **Avoid words like 'minimal' or 'slightly'.** + - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back"). + - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item"). + - **Specify actions with other objects or entities** only when they are clear and observable. + - (O) "pushing another person" + - (X) "interacting with another object" + + --- + ## Output Format: + - Each labeled **{cat_name}** must have **exactly one line**. + - Format: `ID. {cat_name} + action-based description` + - (O) Example: + ``` + 1. The person is holding ski poles and skiing down a snowy mountain with bent knees. + 2. The person is pulling a baby carriage while smiling. + ``` + - **Ensure each object is described individually.** + - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed). + + --- + ## Example: + If the frame has two labeled **bears**, your output should be: + ``` + 1. The bear is reaching out its right paw while leaning forward to catch prey. + 2. A bear is standing upright, facing right, and touching the bike beside it. + ``` + + --- + ## Additional Instructions: + - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right"). + - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed). + - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols). + - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories. + + Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:""" + + + MAX_RETRIES = 3 + retry_count = 0 + + if should_caption: + while retry_count < MAX_RETRIES: + selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2]) + + response2 = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": selected_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + # caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + + caption = response2.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. 
Caption generation failed.") + + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return vid_id, all_captions, valid_obj_ids + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json") + + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(len(metas)): + try: + vid_id, all_captions, valid_obj_ids = getCaption(i) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + except (requests.exceptions.ConnectionError, APIConnectionError) as e: + print(f"created caption until {i}", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/make_ref-ytvos_json_20250113182322.py b/.history/mbench/make_ref-ytvos_json_20250113182322.py new file mode 100644 index 0000000000000000000000000000000000000000..7dda79faa25d68f38e673bb8632fe5549671aa88 --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250113182322.py @@ -0,0 +1,100 @@ +from datasets import build_dataset +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import 
cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + data_idx = 0 + + while data_idx < 10: + + #하나의 비디오에 대해 + video_data = {} + video_id = metas[data_idx]['video'] + video_data['bins'] = metas[data_idx]['bins'] + annotation_data = [] + frame_names = [] + + while metas[data_idx]['video'] == video_id: + + obj_id = metas[data_idx]['obj_id'] + sample_id = metas[data_idx]['sample_id'] + sample_frames_id = metas[data_idx]['sample_frames_id'] + sample_frame_idx = sample_frames_id.index(sample_id) + + frames = metas[data_idx]['frames'] + + frame_name = frames[sample_id] + cat_name = metas[data_idx]['category'] + + bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :] + + obj_data = {obj_id: { + "category_name" : cat_name, + "bbox": bbox + }} + + + annotation_data.append(obj_data) + + frame_names.append(frame_name) + + data_idx += 1 + + video_data['annotations'] = annotation_data + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + + entire_json[video_id] = video_data + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250113182734.py b/.history/mbench/make_ref-ytvos_json_20250113182734.py new file mode 100644 index 0000000000000000000000000000000000000000..d460275ab038cc8b9d9087e1e3595de21ef69a14 --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250113182734.py @@ -0,0 +1,102 @@ +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + data_idx = 0 + + while 
data_idx < 10: + + #하나의 비디오에 대해 + video_data = {} + video_id = metas[data_idx]['video'] + video_data['bins'] = metas[data_idx]['bins'] + annotation_data = [] + frame_names = [] + + while metas[data_idx]['video'] == video_id: + + obj_id = metas[data_idx]['obj_id'] + sample_id = metas[data_idx]['sample_id'] + sample_frames_id = metas[data_idx]['sample_frames_id'] + sample_frame_idx = sample_frames_id.index(sample_id) + + frames = metas[data_idx]['frames'] + + frame_name = frames[sample_id] + cat_name = metas[data_idx]['category'] + + bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :] + + obj_data = {obj_id: { + "category_name" : cat_name, + "bbox": bbox + }} + + + annotation_data.append(obj_data) + + frame_names.append(frame_name) + + data_idx += 1 + + video_data['annotations'] = annotation_data + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + + entire_json[video_id] = video_data + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250113182817.py b/.history/mbench/make_ref-ytvos_json_20250113182817.py new file mode 100644 index 0000000000000000000000000000000000000000..5675fde75aad78185c0398149d2800b28879cde6 --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250113182817.py @@ -0,0 +1,103 @@ +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + + +from pathlib import Path +import os +from os import path as osp +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + data_idx = 0 + + while data_idx < 10: + + #하나의 비디오에 대해 + video_data = {} + video_id = metas[data_idx]['video'] + video_data['bins'] = metas[data_idx]['bins'] + annotation_data = [] + frame_names = [] + + while metas[data_idx]['video'] == video_id: + + obj_id = metas[data_idx]['obj_id'] + sample_id = metas[data_idx]['sample_id'] + sample_frames_id = metas[data_idx]['sample_frames_id'] + sample_frame_idx = sample_frames_id.index(sample_id) + + frames = metas[data_idx]['frames'] + + frame_name = frames[sample_id] + cat_name = metas[data_idx]['category'] + + bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :] + + obj_data = {obj_id: { + 
"category_name" : cat_name, + "bbox": bbox + }} + + + annotation_data.append(obj_data) + + frame_names.append(frame_name) + + data_idx += 1 + + video_data['annotations'] = annotation_data + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + + entire_json[video_id] = video_data + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250113182842.py b/.history/mbench/make_ref-ytvos_json_20250113182842.py new file mode 100644 index 0000000000000000000000000000000000000000..1cdf04b6312f4ae2bda1f420a07d3a0b3de62aef --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250113182842.py @@ -0,0 +1,102 @@ +import sys +from os import path as osp +sys.path.append(os.path.abspath(osp.join(osp.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + + +from pathlib import Path +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + data_idx = 0 + + while data_idx < 10: + + #하나의 비디오에 대해 + video_data = {} + video_id = metas[data_idx]['video'] + video_data['bins'] = metas[data_idx]['bins'] + annotation_data = [] + frame_names = [] + + while metas[data_idx]['video'] == video_id: + + obj_id = metas[data_idx]['obj_id'] + sample_id = metas[data_idx]['sample_id'] + sample_frames_id = metas[data_idx]['sample_frames_id'] + sample_frame_idx = sample_frames_id.index(sample_id) + + frames = metas[data_idx]['frames'] + + frame_name = frames[sample_id] + cat_name = metas[data_idx]['category'] + + bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :] + + obj_data = {obj_id: { + "category_name" : cat_name, + "bbox": bbox + }} + + + annotation_data.append(obj_data) + + frame_names.append(frame_name) + + data_idx += 1 + + video_data['annotations'] = annotation_data + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + + entire_json[video_id] = video_data + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = 
build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250113183130.py b/.history/mbench/make_ref-ytvos_json_20250113183130.py new file mode 100644 index 0000000000000000000000000000000000000000..5123a82c73aa5225d9422c1669e829d11ee28206 --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250113183130.py @@ -0,0 +1,102 @@ +import sys +from os import path as osp +sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + + +from pathlib import Path +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + data_idx = 0 + + while data_idx < 10: + + #하나의 비디오에 대해 + video_data = {} + video_id = metas[data_idx]['video'] + video_data['bins'] = metas[data_idx]['bins'] + annotation_data = [] + frame_names = [] + + while metas[data_idx]['video'] == video_id: + + obj_id = metas[data_idx]['obj_id'] + sample_id = metas[data_idx]['sample_id'] + sample_frames_id = metas[data_idx]['sample_frames_id'] + sample_frame_idx = sample_frames_id.index(sample_id) + + frames = metas[data_idx]['frames'] + + frame_name = frames[sample_id] + cat_name = metas[data_idx]['category'] + + bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :] + + obj_data = {obj_id: { + "category_name" : cat_name, + "bbox": bbox + }} + + + annotation_data.append(obj_data) + + frame_names.append(frame_name) + + data_idx += 1 + + video_data['annotations'] = annotation_data + video_data['frame_names'] = frame_names + video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + + entire_json[video_id] = video_data + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250116141513.py b/.history/mbench/make_ref-ytvos_json_20250116141513.py new file mode 100644 index 0000000000000000000000000000000000000000..b1559a3ec3cd2fc53029a482ee09def964606ed6 --- /dev/null +++ 
b/.history/mbench/make_ref-ytvos_json_20250116141513.py @@ -0,0 +1,103 @@ +import sys +from os import path as osp +sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + + +from pathlib import Path +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + vid_idx = 0 + + while vid_idx < len(train_dataset): + + #하나의 비디오에 대해 + video_data = {} + video_train_frames, video_train_info = train_dataset[vid_idx] + video_meta = metas[vid_idx] + + video_id = video_meta['video'] + video_data['bins'] = video_meta['bins'] + bin_nums = len(video_meta['bins']) + obj_nums = len(list(video_meta['obj_id_cat'].keys())) + + annotation_data = [] + frame_names = [] + + for i in range(bin_nums): + bin_data = {} + for j in range(obj_nums): + obj_id = str(j+1) + obj_data = { + "category_name":video_meta['obj_id_cat'][obj_id], + "bbox":video_train_info['boxes'][i*obj_nums+j, :] + } + bin_data[obj_id] = obj_data + annotation_data.append(bin_data) + + video_data['annotations'] = annotation_data + + + sample_indx = metas[vid_idx]['sample_indx'] + frames = metas[vid_idx]['frames'] + for i in sample_indx: + frame_name = frames[i] + frame_names.append(frame_name) + + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + entire_json[video_id] = video_data + + vid_idx += 1 + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + print(type(entire_json_dict)) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250118024325.py b/.history/mbench/make_ref-ytvos_json_20250118024325.py new file mode 100644 index 0000000000000000000000000000000000000000..f6cbcf7783a5fa3895fce884c8cf62de45c44b12 --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250118024325.py @@ -0,0 +1,108 @@ +import sys +import os +from os import path as osp +sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + + +from pathlib import Path +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) 
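# --- Editor's note: illustrative sketch, not part of the original patch ---------------
# createJson() below stores bounding boxes and valid flags that come out of the dataset
# as torch tensors. json.dumps() raises "TypeError: Object of type Tensor is not JSON
# serializable" on raw tensors, which is why this revision converts them with .tolist()
# and .item() before writing. The tiny helper below is a hypothetical, self-contained
# example of that conversion pattern (the name _example_serialize_obj and the sample
# values are assumptions for illustration only).
import json as _json
import torch as _torch

def _example_serialize_obj():
    box = _torch.tensor([10.0, 20.0, 110.0, 220.0])   # xyxy box, like video_train_info['boxes'][k]
    valid = _torch.tensor(1)                           # like video_train_info['valid'][k]
    obj_data = {
        "category_name": "person",
        "bbox": box.tolist(),    # plain Python list -> JSON serializable
        "valid": valid.item(),   # plain Python int  -> JSON serializable
    }
    return _json.dumps(obj_data, indent=4)
# ---------------------------------------------------------------------------------------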
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + vid_idx = 0 + + while vid_idx < len(train_dataset): + + #하나의 비디오에 대해 + video_data = {} + video_train_frames, video_train_info = train_dataset[vid_idx] + video_meta = metas[vid_idx] + + video_id = video_meta['video'] + video_data['bins'] = video_meta['bins'] + bin_nums = len(video_meta['bins']) + obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())]) + + annotation_data = [] + frame_names = [] + + for i in range(bin_nums): + bin_data = {} + for j in range(obj_nums): + obj_id = str(j+1) + try: + obj_data = { + "category_name":video_meta['obj_id_cat'][obj_id], + "bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist(), + "valid":video_train_info['valid'][i*obj_nums+j].item() + } + except: + obj_data = {} + bin_data[obj_id] = obj_data + annotation_data.append(bin_data) + + video_data['annotations'] = annotation_data + + + sample_indx = metas[vid_idx]['sample_indx'] + frames = metas[vid_idx]['frames'] + for i in sample_indx: + frame_name = frames[i] + frame_names.append(frame_name) + + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + entire_json[video_id] = video_data + + vid_idx += 1 + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + print(type(entire_json_dict)) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame2.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/ytvos_ref_20250121152309.py b/.history/mbench/ytvos_ref_20250121152309.py new file mode 100644 index 0000000000000000000000000000000000000000..9c03ce2c423582837ca12f06dc7b5f3ef6696725 --- /dev/null +++ b/.history/mbench/ytvos_ref_20250121152309.py @@ -0,0 +1,264 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.utils.data import Dataset + +import os +from PIL import Image +import json +import numpy as np +import random + +# from datasets.categories import ytvos_category_dict as category_dict + + +category_dict = { + 'airplane': 0, 'ape': 1, 'bear': 2, 'bike': 3, 'bird': 4, 'boat': 5, 'bucket': 6, 'bus': 7, 'camel': 8, 'cat': 9, + 'cow': 10, 'crocodile': 11, 'deer': 12, 'dog': 13, 'dolphin': 14, 'duck': 15, 'eagle': 16, 'earless_seal': 17, + 'elephant': 18, 'fish': 19, 'fox': 20, 'frisbee': 21, 'frog': 22, 'giant_panda': 23, 'giraffe': 24, 'hand': 25, + 'hat': 26, 'hedgehog': 27, 'horse': 28, 'knife': 29, 'leopard': 30, 'lion': 31, 'lizard': 32, 'monkey': 33, + 'motorbike': 34, 'mouse': 35, 'others': 36, 'owl': 37, 'paddle': 38, 'parachute': 39, 'parrot': 40, 'penguin': 41, + 'person': 42, 'plant': 43, 'rabbit': 44, 'raccoon': 45, 
'sedan': 46, 'shark': 47, 'sheep': 48, 'sign': 49, + 'skateboard': 50, 'snail': 51, 'snake': 52, 'snowboard': 53, 'squirrel': 54, 'surfboard': 55, 'tennis_racket': 56, + 'tiger': 57, 'toilet': 58, 'train': 59, 'truck': 60, 'turtle': 61, 'umbrella': 62, 'whale': 63, 'zebra': 64 +} + + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + meta = { + 'video':vid, + 'sample_indx':sample_indx, + 'bins':bins, + 'frames':vid_frames + } + obj_id_cat = {} + for exp_id, exp_dict in vid_data['expressions'].items(): + obj_id = exp_dict['obj_id'] + if obj_id not in obj_id_cat: + obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category'] + meta['obj_id_cat'] = obj_id_cat + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = 
np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + meta = self.metas[idx] # dict + + video, sample_indx, bins, frames, obj_id_cat = \ + meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat'] + + # read frames and masks + annos = {} + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_indx: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + imgs.append(img) + + mask = Image.open(mask_path).convert('P') + mask = np.array(mask) + + frame_annotations = {} + + # create the target + for obj_id in list(obj_id_cat.keys()): + obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary + if (obj_mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(obj_mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + val = 1 + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + val = 0 + obj_mask = torch.from_numpy(obj_mask) + + # append + masks.append(obj_mask) + boxes.append(box) + + frame_annotations[obj_id] = { + 'category_name': obj_id_cat[obj_id], + 'bbox': box, + 'valid' : val, + 'mask': obj_mask + } + + annos[frame_indx] = frame_annotations + + + # transform + w, h = img.size + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': sample_indx, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'obj_ids' : list(obj_id_cat.keys()), + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + # if self._transforms: + # imgs, target = self._transforms(imgs, target) + # imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + # else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # # FIXME: handle "valid", since some box may be removed due to random crop + # if torch.any(target['valid'] == 1): # at leatst one instance + # instance_check = True + # else: + # idx = random.randint(0, self.__len__() - 1) + + return imgs, target, annos + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not 
exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/mbench_a2d/gpt_a2d_numbered_20250205111640.py b/.history/mbench_a2d/gpt_a2d_numbered_20250205111640.py new file mode 100644 index 0000000000000000000000000000000000000000..2f2b93c1aae931e5d7f2fcea318f2ddc7de47ea2 --- /dev/null +++ b/.history/mbench_a2d/gpt_a2d_numbered_20250205111640.py @@ -0,0 +1,82 @@ +from datasets import build_dataset +import argparse +import opts + +import sys +import os +import time + +import numpy as np +import matplotlib.pyplot as plt +import cv2 +from io import BytesIO +import base64 +from PIL import Image + +from openai import OpenAI + +def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False): + #마스크 색칠할지 + if color_mask == True: + alpha = 0.1 + + colored_mask = np.zeros_like(frame) + colored_mask[mask == 1] = [255, 0, 0] + frame[mask == 1] = ( + (1 - alpha) * frame[mask == 1] + + alpha * colored_mask[mask == 1] + ) + + #마스크 아웃라인 그리기 + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, [255, 0, 0], 2) + + #instance_id 적을지 + if label_number == True: + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = str(instance_id) + font_scale = 0.6 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2) + + # plt.figure(figsize=(6, 10)) + # plt.imshow(frame) + # plt.title(text_query) + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + encoded_frame = base64.b64encode(buffer.read()).decode("utf-8") + + return encoded_frame + + +if __name__ == "__main__": + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + train_dataset = build_dataset('a2d', image_set = 'train', args = args) + text_annotations = train_dataset.text_annotations \ No newline at end of file diff --git a/.history/mbench_a2d/gpt_a2d_numbered_20250205122340.py b/.history/mbench_a2d/gpt_a2d_numbered_20250205122340.py new file mode 100644 index 0000000000000000000000000000000000000000..30f5a49a52cd7cf1d026191764a2da47bf509ebd --- /dev/null +++ 
b/.history/mbench_a2d/gpt_a2d_numbered_20250205122340.py @@ -0,0 +1,196 @@ +from datasets import build_dataset +import argparse +import opts + +import sys +import os +import time + +import numpy as np +import matplotlib.pyplot as plt +import cv2 +from io import BytesIO +import base64 +from PIL import Image +import json + +from openai import OpenAI + +def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False): + #마스크 색칠할지 + if color_mask == True: + alpha = 0.1 + + colored_mask = np.zeros_like(frame) + colored_mask[mask == 1] = [255, 0, 0] + frame[mask == 1] = ( + (1 - alpha) * frame[mask == 1] + + alpha * colored_mask[mask == 1] + ) + + #마스크 아웃라인 그리기 + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, [255, 0, 0], 2) + + #instance_id 적을지 + if label_number == True: + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = str(instance_id) + font_scale = 0.6 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2) + + # plt.figure(figsize=(6, 10)) + # plt.imshow(frame) + # plt.title(text_query) + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + encoded_frame = base64.b64encode(buffer.read()).decode("utf-8") + + return encoded_frame + +def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True): + + base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number) + + captioner = OpenAI() + + #필터링하지 않고 바로 ref exp 만들기 + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary. + I also give you a text query describing the marked object. + I want to use your expression to create an **action-centric referring expression** dataset. + Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions + --- + ## Guidelines: + 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head"). + 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”). + 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. 
If there are multiple objects, ensure the description for the marked object **differentiates** its action. + 7. Base your description on these action definitions: + - Avoid using term 'minimal' or 'slightly'. + - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back") + - details such as motion and intention, facial with object manipulation + - movements with object or other entities when they are prominent and observable. expression should be specific. + (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X)) + -- + ## Output Format: + - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format : + object id. action-oriented description + (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.) + ### Example + If the frame has 1 labeled bear, your output should look like: + 1. the bear reaching his right arm while leaning forward to capture the prey + --- + **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”). + **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed). + **Do not include markdown** in the output. + Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled object, output referring expressions for each object id. + """ + prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}" + + MAX_RETRIES = 2 + retry_count = 0 + + while retry_count < MAX_RETRIES: + response = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt_with_text_query, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + + caption = response.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. 
Caption generation failed.") + + else: + caption = None + + return caption + +if __name__ == "__main__": + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json') + args = parser.parse_args() + + train_dataset = build_dataset('a2d', image_set = 'train', args = args) + text_annotations = train_dataset.text_annotations + + all_captions = {} + + os.environ['OPENAI_API_KEY'] = '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' + + for idx in range(100): + imgs, target = train_dataset[idx] + frames_idx = target['frames_idx'].tolist() + text_query, vid_id, frame_id, instance_id = text_annotations[idx] + + frame_id = frame_id - 1 + frame_order = frames_idx.index(frame_id) + + frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy() + mask = target['masks'].numpy().astype(np.uint8).squeeze() + + caption = getCaption(frame, mask, instance_id, text_query) + if vid_id not in all_captions: + all_captions[vid_id] = {frame_id : caption} + else: + all_captions[vid_id][frame_id] = caption + + + with open(args.save_caption_path, 'w') as file: + json.dump(all_captions, file, indent=4) + diff --git a/.history/mbench_a2d/gpt_a2d_numbered_20250205152326.py b/.history/mbench_a2d/gpt_a2d_numbered_20250205152326.py new file mode 100644 index 0000000000000000000000000000000000000000..077150c0b8dbc312dfdc7335e334720d0caef8e9 --- /dev/null +++ b/.history/mbench_a2d/gpt_a2d_numbered_20250205152326.py @@ -0,0 +1,200 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts +import time + +import numpy as np +import matplotlib.pyplot as plt +import cv2 +from io import BytesIO +import base64 +from PIL import Image +import json + +from openai import OpenAI + +def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False): + #마스크 색칠할지 + if color_mask == True: + alpha = 0.1 + + colored_mask = np.zeros_like(frame) + colored_mask[mask == 1] = [255, 0, 0] + frame[mask == 1] = ( + (1 - alpha) * frame[mask == 1] + + alpha * colored_mask[mask == 1] + ) + + #마스크 아웃라인 그리기 + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, [255, 0, 0], 2) + + #instance_id 적을지 + if label_number == True: + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = str(instance_id) + font_scale = 0.6 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2) + + # plt.figure(figsize=(6, 10)) + # plt.imshow(frame) + # plt.title(text_query) + # plt.tight_layout() + # 
plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + encoded_frame = base64.b64encode(buffer.read()).decode("utf-8") + + return encoded_frame + +def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True): + + base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number) + + captioner = OpenAI() + + #필터링하지 않고 바로 ref exp 만들기 + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary. + I also give you a text query describing the marked object. + I want to use your expression to create an **action-centric referring expression** dataset. + Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions + --- + ## Guidelines: + 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head"). + 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”). + 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action. + 7. Base your description on these action definitions: + - Avoid using term 'minimal' or 'slightly'. + - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back") + - details such as motion and intention, facial with object manipulation + - movements with object or other entities when they are prominent and observable. expression should be specific. + (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X)) + -- + ## Output Format: + - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format : + object id. action-oriented description + (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.) + ### Example + If the frame has 1 labeled bear, your output should look like: + 1. the bear reaching his right arm while leaning forward to capture the prey + --- + **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”). + **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed). + **Do not include markdown** in the output. + Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled object, output referring expressions for each object id. 
+ """ + prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}" + + MAX_RETRIES = 2 + retry_count = 0 + + while retry_count < MAX_RETRIES: + response = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt_with_text_query, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + + caption = response.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. Caption generation failed.") + + else: + caption = None + + return caption + +if __name__ == "__main__": + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json') + args = parser.parse_args() + + train_dataset = build_dataset('a2d', image_set = 'train', args = args) + text_annotations = train_dataset.text_annotations + + all_captions = {} + + #os.environ['OPENAI_API_KEY'] = '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' + os.environ['OPENAI_API_KEY'] = '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' + + for idx in range(100): + imgs, target = train_dataset[idx] + frames_idx = target['frames_idx'].tolist() + text_query, vid_id, frame_id, instance_id = text_annotations[idx] + print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True) + + frame_id = frame_id - 1 + frame_order = frames_idx.index(frame_id) + + frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy() + mask = target['masks'].numpy().astype(np.uint8).squeeze() + + caption = getCaption(frame, mask, instance_id, text_query) + if vid_id not in all_captions: + all_captions[vid_id] = {frame_id : caption} + else: + all_captions[vid_id][frame_id] = caption + + print("Finished!", flush=True) + + with open(args.save_caption_path, 'w') as file: + json.dump(all_captions, file, indent=4) + diff --git a/.history/mbench_a2d/gpt_a2d_numbered_20250207110257.py b/.history/mbench_a2d/gpt_a2d_numbered_20250207110257.py new file mode 100644 index 0000000000000000000000000000000000000000..32811050ac4261c8752eb49187c25e547a742903 --- /dev/null +++ b/.history/mbench_a2d/gpt_a2d_numbered_20250207110257.py @@ -0,0 +1,213 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts +import time + +import numpy as np +import matplotlib.pyplot as plt +import cv2 +from io import BytesIO +import base64 +from PIL import Image +import json + +from openai import OpenAI + +def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False): + #마스크 색칠할지 + if color_mask == True: + alpha = 0.1 + + colored_mask = np.zeros_like(frame) + colored_mask[mask == 1] = [255, 0, 0] + frame[mask == 1] = ( + (1 - alpha) * frame[mask == 1] + + alpha 
* colored_mask[mask == 1] + ) + + # draw the mask outline + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, [255, 0, 0], 2) + + # whether to write the instance_id + if label_number == True: + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = str(instance_id) + font_scale = 0.6 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # horizontal center of the text + text_y = center_y + # text_y = center_y + text_size[1] // 2 # vertical center of the text + + # compute the text background rectangle coordinates + rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2) + + # plt.figure(figsize=(6, 10)) + # plt.imshow(frame) + # plt.title(text_query) + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + encoded_frame = base64.b64encode(buffer.read()).decode("utf-8") + + return encoded_frame + +def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True): + + base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number) + + captioner = OpenAI() + + # build the referring expression directly, without filtering + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary. + I also give you a text query describing the marked object. + I want to use your expression to create an **action-centric referring expression** dataset. + Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions + --- + ## Guidelines: + 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head"). + 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”). + 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action. + 7. Base your description on these action definitions: + - Avoid using term 'minimal' or 'slightly'. + - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back") + - details such as motion and intention, facial with object manipulation + - movements with object or other entities when they are prominent and observable. expression should be specific. 
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X)) + -- + ## Output Format: + - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format : + object id. action-oriented description + (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.) + ### Example + If the frame has 1 labeled bear, your output should look like: + 1. the bear reaching his right arm while leaning forward to capture the prey + --- + **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”). + **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed). + **Do not include markdown** in the output. + Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled object, output referring expressions for each object id. + """ + prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}" + + MAX_RETRIES = 2 + retry_count = 0 + + while retry_count < MAX_RETRIES: + response = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt_with_text_query, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + + caption = response.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. 
Caption generation failed.") + + else: + caption = None + + return caption + +if __name__ == "__main__": + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json') + args = parser.parse_args() + + train_dataset = build_dataset('a2d', image_set = 'train', args = args) + text_annotations = train_dataset.text_annotations + + all_captions = {} + + #os.environ['OPENAI_API_KEY'] = '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' + os.environ['OPENAI_API_KEY'] = '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' + + first_text_query = "" + for idx in range(300): + imgs, target = train_dataset[idx] + frames_idx = target['frames_idx'].tolist() + text_query, vid_id, frame_id, instance_id = text_annotations[idx] + + if text_query == first_text_query: + continue + + print(f"------------vid id: {vid_id}, frame id: {frame_id}, instance id: {instance_id}", flush=True) + + frame_id = frame_id - 1 + frame_order = frames_idx.index(frame_id) + + frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy() + mask = target['masks'].numpy().astype(np.uint8).squeeze() + + caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini') + + if vid_id in all_captions: + if frame_id in all_captions[vid_id]: + all_captions[vid_id][frame_id][instance_id] = caption + else: + all_captions[vid_id][frame_id] = {instance_id : caption} + else: + all_captions[vid_id] = {frame_id : {instance_id: caption}} + + if idx % 50 == 0: + with open(args.save_caption_path, 'w') as file: + json.dump(all_captions, file, indent=4) + + print("Finished!", flush=True) + + with open(args.save_caption_path, 'w') as file: + json.dump(all_captions, file, indent=4) + diff --git a/.history/slurm_script/jupyter_20250121151552.sh b/.history/slurm_script/jupyter_20250121151552.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f04e43ce8f6b2bb595d2acaa4aa23900c0e08d1 --- /dev/null +++ b/.history/slurm_script/jupyter_20250121151552.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +#SBATCH --job-name=jupyter +#SBATCH --partition=a5000 +#SBATCH --nodelist=node04 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + srun jupyter notebook --no-browser --port=7890 diff --git a/.history/slurm_script/jupyter_20250121151643.sh b/.history/slurm_script/jupyter_20250121151643.sh new file mode 100644 index 0000000000000000000000000000000000000000..8016d1cd5bbbde20ce08b458be6636042329d45a --- /dev/null +++ b/.history/slurm_script/jupyter_20250121151643.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +#SBATCH --job-name=jupyter +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + srun jupyter notebook --no-browser --port=7890 diff --git 
a/.history/slurm_script/mbench_gpt_a2d_20250205122515.sh b/.history/slurm_script/mbench_gpt_a2d_20250205122515.sh new file mode 100644 index 0000000000000000000000000000000000000000..272f6b2debfaaf173a3b18e43a41175b6c21e42f --- /dev/null +++ b/.history/slurm_script/mbench_gpt_a2d_20250205122515.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_a2d +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_a2d.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy.py \ + --save_caption_path mbench_a2d/numbered_captions.json \ No newline at end of file diff --git a/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155940.sh b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155940.sh new file mode 100644 index 0000000000000000000000000000000000000000..700e8cd581fa8bd7ad478f24dd1a331dca4826d1 --- /dev/null +++ b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155940.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_revised +#SBATCH --partition=a5000 +#SBATCH --nodelist=node04 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_revised.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_revised.py \ No newline at end of file diff --git a/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121160841.sh b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121160841.sh new file mode 100644 index 0000000000000000000000000000000000000000..a1138085006d50d5ac38ab1697dbe9387c27a87c --- /dev/null +++ b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121160841.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_revised50 +#SBATCH --partition=a5000 +#SBATCH --nodelist=node04 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_revised50.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_revised.py \ No newline at end of file diff --git a/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250124085144.sh b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250124085144.sh new file mode 100644 index 0000000000000000000000000000000000000000..ebc3e3eb87ce0237841b3d0e21bea3399918ffaa --- /dev/null +++ b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250124085144.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_revised50 +#SBATCH --partition=a5000 +#SBATCH --nodelist=node04 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_revised50.out + cd 
/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos-revised.py \ No newline at end of file diff --git a/.history/slurm_script/mbench_gpt_ref-ytvos_20250119070944.sh b/.history/slurm_script/mbench_gpt_ref-ytvos_20250119070944.sh new file mode 100644 index 0000000000000000000000000000000000000000..5f508bfcaa6330ddfe61012d5cd8f8968f58eee7 --- /dev/null +++ b/.history/slurm_script/mbench_gpt_ref-ytvos_20250119070944.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos.py \ No newline at end of file diff --git a/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130190228.sh b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130190228.sh new file mode 100644 index 0000000000000000000000000000000000000000..6efa2f04f01effd7e59d092a9e0302505d2b7366 --- /dev/null +++ b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130190228.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy.py \ + --save_caption_path mbench/numbered_captions.json \ + --save_valid_obj_ids_path mbench/numbered_valid_obj_ids.json \ No newline at end of file diff --git a/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250201140706.sh b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250201140706.sh new file mode 100644 index 0000000000000000000000000000000000000000..214982940b3825256ee2667dd84ff0c0b7e328f0 --- /dev/null +++ b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250201140706.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy.py \ + --save_caption_path mbench/numbered_captions_gpt-4o.json \ + --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o.json \ No newline at end of file diff --git a/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250202183206.sh b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250202183206.sh new file mode 100644 index 0000000000000000000000000000000000000000..9ad0c1b0c158086bcc48659bedcd1edbbffb8ccb --- /dev/null +++ 
b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250202183206.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy.py \ + --save_caption_path mbench/numbered_captions_gpt-4o_no_mask_color.json \ + --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_no_mask_color.json \ No newline at end of file diff --git a/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171604.sh b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171604.sh new file mode 100644 index 0000000000000000000000000000000000000000..a498a1739c34ce060b0e8802a68c2c2ca896c1cc --- /dev/null +++ b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171604.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered_final +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered_final.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py \ + --save_caption_path mbench/numbered_captions_gpt-4o_final.json \ + --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_final.json \ No newline at end of file diff --git a/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207172920.sh b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207172920.sh new file mode 100644 index 0000000000000000000000000000000000000000..27693ebe2eec10425b7ea8820129e0cbeb838ab1 --- /dev/null +++ b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207172920.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered_final +#SBATCH --partition=a5000 +#SBATCH --nodelist=node04 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered_final.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py \ + --save_caption_path mbench/numbered_captions_gpt-4o_final.json \ + --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_final.json \ No newline at end of file diff --git a/hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/ca26d90c9e8e071d0bc31b570aef68306d0be1db4330471d10a117061a15a991.lock b/hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/ca26d90c9e8e071d0bc31b570aef68306d0be1db4330471d10a117061a15a991.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/pytorch_model.bin 
b/hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b new file mode 100644 index 0000000000000000000000000000000000000000..96cf756627594683e4d906d9b3ebd56ed7d7bc5c --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b +size 9999791010 diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cc6c13cb9acd48b061e2d2664a50963c338b4998 b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cc6c13cb9acd48b061e2d2664a50963c338b4998 new file mode 100644 index 0000000000000000000000000000000000000000..cc6c13cb9acd48b061e2d2664a50963c338b4998 --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cc6c13cb9acd48b061e2d2664a50963c338b4998 @@ -0,0 +1,962 @@ +{ + "metadata": { + "total_size": 22919639040 + }, + "weight_map": { + "decoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + 
"decoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.o.weight": 
"pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + 
"decoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.o.weight": 
"pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.layer_norm.weight": 
"pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.2.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.2.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00003.bin", + 
"decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.2.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.2.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + 
"decoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + 
"decoder.block.7.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.embed_tokens.weight": "pytorch_model-00001-of-00003.bin", + "decoder.final_layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + 
"encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": 
"pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + 
"encoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + 
"encoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", 
+ "encoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + 
"encoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.embed_tokens.weight": "pytorch_model-00001-of-00003.bin", + "encoder.final_layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "lm_head.weight": "pytorch_model-00003-of-00003.bin", + "mm_projector.0.bias": "pytorch_model-00003-of-00003.bin", + "mm_projector.0.weight": "pytorch_model-00003-of-00003.bin", + "mm_projector.2.bias": "pytorch_model-00003-of-00003.bin", + "mm_projector.2.weight": "pytorch_model-00003-of-00003.bin", + "shared.weight": "pytorch_model-00001-of-00003.bin", + "vision_tower.vision_tower.vision_model.embeddings.class_embedding": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.post_layernorm.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.post_layernorm.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.pre_layrnorm.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.pre_layrnorm.weight": "pytorch_model-00003-of-00003.bin" + } +} diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b new file mode 100644 index 0000000000000000000000000000000000000000..cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b @@ -0,0 +1,962 @@ +{ + "metadata": { + "total_size": 22919639040 + }, + "weight_map": { + "decoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.1.EncDecAttention.k.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.1.EncDecAttention.o.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.1.EncDecAttention.q.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.1.EncDecAttention.v.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.2.DenseReluDense.wo.weight": 
"model-00002-of-00003.safetensors", + "decoder.block.0.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": 
"model-00002-of-00003.safetensors", + "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + 
"decoder.block.14.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.1.EncDecAttention.k.weight": 
"model-00002-of-00003.safetensors", + "decoder.block.17.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + 
"decoder.block.2.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.20.layer.0.SelfAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.0.SelfAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.20.layer.0.SelfAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.0.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.1.EncDecAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.1.EncDecAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.1.EncDecAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.1.EncDecAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.1.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.2.DenseReluDense.wo.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.2.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.0.SelfAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.0.SelfAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.0.SelfAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.0.SelfAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.0.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.1.EncDecAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.1.EncDecAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.1.EncDecAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.1.EncDecAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.1.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.2.DenseReluDense.wo.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.2.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.0.SelfAttention.k.weight": 
"model-00003-of-00003.safetensors", + "decoder.block.22.layer.0.SelfAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.0.SelfAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.0.SelfAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.0.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.1.EncDecAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.1.EncDecAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.1.EncDecAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.1.EncDecAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.1.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.2.DenseReluDense.wo.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.2.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.0.SelfAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.0.SelfAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.0.SelfAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.0.SelfAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.0.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.1.EncDecAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.1.EncDecAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.1.EncDecAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.1.EncDecAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.1.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.2.DenseReluDense.wo.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.2.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.3.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + 
"decoder.block.3.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + 
"decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + 
"decoder.block.9.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.embed_tokens.weight": "model-00001-of-00003.safetensors", + "decoder.final_layer_norm.weight": "model-00003-of-00003.safetensors", + "encoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + 
"encoder.block.11.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.1.DenseReluDense.wo.weight": 
"model-00001-of-00003.safetensors", + "encoder.block.15.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.0.SelfAttention.q.weight": 
"model-00001-of-00003.safetensors", + "encoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + 
"encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + 
"encoder.block.7.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.embed_tokens.weight": "model-00001-of-00003.safetensors", + "encoder.final_layer_norm.weight": "model-00001-of-00003.safetensors", + "lm_head.weight": "model-00003-of-00003.safetensors", + "mm_projector.0.bias": "model-00003-of-00003.safetensors", + "mm_projector.0.weight": "model-00003-of-00003.safetensors", + "mm_projector.2.bias": "model-00003-of-00003.safetensors", + "mm_projector.2.weight": "model-00003-of-00003.safetensors", + "shared.weight": "model-00001-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.embeddings.class_embedding": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": 
"model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.pre_layrnorm.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.pre_layrnorm.weight": "model-00003-of-00003.safetensors" + } +} \ No newline at end of file diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/e7dbc990f8ede75b1ad2fd17028fbd89a950286a b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/e7dbc990f8ede75b1ad2fd17028fbd89a950286a new file mode 100644 index 0000000000000000000000000000000000000000..e7dbc990f8ede75b1ad2fd17028fbd89a950286a --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/e7dbc990f8ede75b1ad2fd17028fbd89a950286a @@ -0,0 +1,44 @@ +{ + "_name_or_path": "google/flan-t5-xxl", + "architectures": [ + "CLIPT5ForConditionalGeneration" + ], + "d_ff": 10240, + "d_kv": 64, + "d_model": 4096, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "freeze_mm_mlp_adapter": false, + 
"image_aspect_ratio": "pad", + "image_grid_pinpoints": null, + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "mm_hidden_size": 1024, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "openai/clip-vit-large-patch14-336", + "model_type": "t5", + "num_decoder_layers": 24, + "num_heads": 64, + "num_layers": 24, + "output_past": true, + "pad_token_id": 0, + "prefix_mask": false, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.31.0", + "tune_mm_mlp_adapter": false, + "use_cache": true, + "use_mm_proj": true, + "vocab_size": 32128 +} diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/config.json b/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e7dbc990f8ede75b1ad2fd17028fbd89a950286a --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/config.json @@ -0,0 +1,44 @@ +{ + "_name_or_path": "google/flan-t5-xxl", + "architectures": [ + "CLIPT5ForConditionalGeneration" + ], + "d_ff": 10240, + "d_kv": 64, + "d_model": 4096, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "freeze_mm_mlp_adapter": false, + "image_aspect_ratio": "pad", + "image_grid_pinpoints": null, + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "mm_hidden_size": 1024, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "openai/clip-vit-large-patch14-336", + "model_type": "t5", + "num_decoder_layers": 24, + "num_heads": 64, + "num_layers": 24, + "output_past": true, + "pad_token_id": 0, + "prefix_mask": false, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.31.0", + "tune_mm_mlp_adapter": false, + "use_cache": true, + "use_mm_proj": true, + "vocab_size": 32128 +} diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/generation_config.json b/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7528dbb1b6ce860d242aff71294a5fef12a41572 --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.31.0" +} diff --git a/inference_ytvos.py b/inference_ytvos.py new file mode 100644 index 0000000000000000000000000000000000000000..7c24df6901de051432b5c975d819b6abcdb53bd4 --- /dev/null +++ b/inference_ytvos.py @@ -0,0 +1,326 @@ +''' +Inference code for ReferFormer, on Ref-Youtube-VOS +Modified from DETR (https://github.com/facebookresearch/detr) 
+''' +import argparse +import json +import random +import time +from pathlib import Path + +import numpy as np +import torch + +import util.misc as utils +from models import build_model +import torchvision.transforms as T +import matplotlib.pyplot as plt +import os +import cv2 +from PIL import Image, ImageDraw +import math +import torch.nn.functional as F +import json + +import opts +from tqdm import tqdm + +import multiprocessing as mp +import threading + +from tools.colormap import colormap + + +# colormap +color_list = colormap() +color_list = color_list.astype('uint8').tolist() + +# build transform +transform = T.Compose([ + T.Resize(360), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) +]) + + +def main(args): + args.masks = True + args.batch_size = 1 + print("Inference only supports batch size = 1") + + # fix the seed for reproducibility + seed = args.seed + utils.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + split = args.split + # save path + output_dir = args.output_dir + save_path_prefix = os.path.join(output_dir, split) + if not os.path.exists(save_path_prefix): + os.makedirs(save_path_prefix) + + save_visualize_path_prefix = os.path.join(output_dir, split + '_images') + if args.visualize: + if not os.path.exists(save_visualize_path_prefix): + os.makedirs(save_visualize_path_prefix) + + # load data + root = Path(args.ytvos_path) # data/ref-youtube-vos + img_folder = os.path.join(root, split, "JPEGImages") + meta_file = os.path.join(root, "meta_expressions", split, "meta_expressions.json") + with open(meta_file, "r") as f: + data = json.load(f)["videos"] + valid_test_videos = set(data.keys()) + # for some reason the competition's validation expressions dict contains both the validation (202) & + # test videos (305).
so we simply load the test expressions dict and use it to filter out the test videos from + # the validation expressions dict: + test_meta_file = os.path.join(root, "meta_expressions", "test", "meta_expressions.json") + with open(test_meta_file, 'r') as f: + test_data = json.load(f)['videos'] + test_videos = set(test_data.keys()) + valid_videos = valid_test_videos - test_videos + video_list = sorted([video for video in valid_videos]) + assert len(video_list) == 202, 'error: incorrect number of validation videos' + + # create subprocess + thread_num = args.ngpu + global result_dict + result_dict = mp.Manager().dict() + + processes = [] + lock = threading.Lock() + + video_num = len(video_list) + per_thread_video_num = video_num // thread_num + + start_time = time.time() + print('Start inference') + for i in range(thread_num): + if i == thread_num - 1: + sub_video_list = video_list[i * per_thread_video_num:] + else: + sub_video_list = video_list[i * per_thread_video_num: (i + 1) * per_thread_video_num] + p = mp.Process(target=sub_processor, args=(lock, i, args, data, + save_path_prefix, save_visualize_path_prefix, + img_folder, sub_video_list)) + p.start() + processes.append(p) + + for p in processes: + p.join() + + end_time = time.time() + total_time = end_time - start_time + + result_dict = dict(result_dict) + num_all_frames_gpus = 0 + for pid, num_all_frames in result_dict.items(): + num_all_frames_gpus += num_all_frames + + print("Total inference time: %.4f s" %(total_time)) + +def sub_processor(lock, pid, args, data, save_path_prefix, save_visualize_path_prefix, img_folder, video_list): + text = 'processor %d' % pid + with lock: + progress = tqdm( + total=len(video_list), + position=pid, + desc=text, + ncols=0 + ) + torch.cuda.set_device(pid) + + # model + model, criterion, _ = build_model(args) + device = args.device + model.to(device) + + model_without_ddp = model + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + + if pid == 0: + print('number of params:', n_parameters) + + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False) + unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))] + if len(missing_keys) > 0: + print('Missing Keys: {}'.format(missing_keys)) + if len(unexpected_keys) > 0: + print('Unexpected Keys: {}'.format(unexpected_keys)) + else: + raise ValueError('Please specify the checkpoint for inference.') + + + # start inference + num_all_frames = 0 + model.eval() + + # 1. For each video + for video in video_list: + metas = [] # list[dict], length is number of expressions + + expressions = data[video]["expressions"] + expression_list = list(expressions.keys()) + num_expressions = len(expression_list) + video_len = len(data[video]["frames"]) + + # read all the anno meta + for i in range(num_expressions): + meta = {} + meta["video"] = video + meta["exp"] = expressions[expression_list[i]]["exp"] + meta["exp_id"] = expression_list[i] + meta["frames"] = data[video]["frames"] + metas.append(meta) + meta = metas + + # 2. 
For each expression + for i in range(num_expressions): + video_name = meta[i]["video"] + exp = meta[i]["exp"] + exp_id = meta[i]["exp_id"] + frames = meta[i]["frames"] + + video_len = len(frames) + # store images + imgs = [] + for t in range(video_len): + frame = frames[t] + img_path = os.path.join(img_folder, video_name, frame + ".jpg") + img = Image.open(img_path).convert('RGB') + origin_w, origin_h = img.size + imgs.append(transform(img)) # list[img] + + imgs = torch.stack(imgs, dim=0).to(args.device) # [video_len, 3, h, w] + img_h, img_w = imgs.shape[-2:] + size = torch.as_tensor([int(img_h), int(img_w)]).to(args.device) + target = {"size": size} + + with torch.no_grad(): + outputs = model([imgs], [exp], [target]) + + pred_logits = outputs["pred_logits"][0] + pred_boxes = outputs["pred_boxes"][0] + pred_masks = outputs["pred_masks"][0] + pred_ref_points = outputs["reference_points"][0] + + # according to pred_logits, select the query index + pred_scores = pred_logits.sigmoid() # [t, q, k] + pred_scores = pred_scores.mean(0) # [q, k] + max_scores, _ = pred_scores.max(-1) # [q,] + _, max_ind = max_scores.max(-1) # [1,] + max_inds = max_ind.repeat(video_len) + pred_masks = pred_masks[range(video_len), max_inds, ...] # [t, h, w] + pred_masks = pred_masks.unsqueeze(0) + + pred_masks = F.interpolate(pred_masks, size=(origin_h, origin_w), mode='bilinear', align_corners=False) + pred_masks = (pred_masks.sigmoid() > args.threshold).squeeze(0).detach().cpu().numpy() + + # store the video results + all_pred_logits = pred_logits[range(video_len), max_inds] + all_pred_boxes = pred_boxes[range(video_len), max_inds] + all_pred_ref_points = pred_ref_points[range(video_len), max_inds] + all_pred_masks = pred_masks + + if args.visualize: + for t, frame in enumerate(frames): + # original + img_path = os.path.join(img_folder, video_name, frame + '.jpg') + source_img = Image.open(img_path).convert('RGBA') # PIL image + + draw = ImageDraw.Draw(source_img) + draw_boxes = all_pred_boxes[t].unsqueeze(0) + draw_boxes = rescale_bboxes(draw_boxes.detach(), (origin_w, origin_h)).tolist() + + # draw boxes + xmin, ymin, xmax, ymax = draw_boxes[0] + draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=tuple(color_list[i%len(color_list)]), width=2) + + # draw reference point + ref_points = all_pred_ref_points[t].unsqueeze(0).detach().cpu().tolist() + draw_reference_points(draw, ref_points, source_img.size, color=color_list[i%len(color_list)]) + + # draw mask + source_img = vis_add_mask(source_img, all_pred_masks[t], color_list[i%len(color_list)]) + + # save + save_visualize_path_dir = os.path.join(save_visualize_path_prefix, video, str(i)) + if not os.path.exists(save_visualize_path_dir): + os.makedirs(save_visualize_path_dir) + save_visualize_path = os.path.join(save_visualize_path_dir, frame + '.png') + source_img.save(save_visualize_path) + + + # save binary image + save_path = os.path.join(save_path_prefix, video_name, exp_id) + if not os.path.exists(save_path): + os.makedirs(save_path) + for j in range(video_len): + frame_name = frames[j] + mask = all_pred_masks[j].astype(np.float32) + mask = Image.fromarray(mask * 255).convert('L') + save_file = os.path.join(save_path, frame_name + ".png") + mask.save(save_file) + + with lock: + progress.update(1) + result_dict[str(pid)] = num_all_frames + with lock: + progress.close() + + +# visuaize functions +def box_cxcywh_to_xyxy(x): + x_c, y_c, w, h = x.unbind(1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), + (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=1) + 
+def rescale_bboxes(out_bbox, size): + img_w, img_h = size + b = box_cxcywh_to_xyxy(out_bbox) + b = b.cpu() * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32) + return b + + +# Visualization functions +def draw_reference_points(draw, reference_points, img_size, color): + W, H = img_size + for i, ref_point in enumerate(reference_points): + init_x, init_y = ref_point + x, y = W * init_x, H * init_y + cur_color = color + draw.line((x-10, y, x+10, y), tuple(cur_color), width=4) + draw.line((x, y-10, x, y+10), tuple(cur_color), width=4) + +def draw_sample_points(draw, sample_points, img_size, color_list): + alpha = 255 + for i, samples in enumerate(sample_points): + for sample in samples: + x, y = sample + cur_color = color_list[i % len(color_list)][::-1] + cur_color += [alpha] + draw.ellipse((x-2, y-2, x+2, y+2), + fill=tuple(cur_color), outline=tuple(cur_color), width=1) + +def vis_add_mask(img, mask, color): + origin_img = np.asarray(img.convert('RGB')).copy() + color = np.array(color) + + mask = mask.reshape(mask.shape[0], mask.shape[1]).astype('uint8') # np + mask = mask > 0.5 + + origin_img[mask] = origin_img[mask] * 0.5 + color * 0.5 + origin_img = Image.fromarray(origin_img) + return origin_img + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer inference script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + main(args) diff --git a/logs/gpt_ref-ytvos_numbered_cy_sanity.log b/logs/gpt_ref-ytvos_numbered_cy_sanity.log new file mode 100644 index 0000000000000000000000000000000000000000..08b51ea286b4289e93268a3b1d435e245c74af17 --- /dev/null +++ b/logs/gpt_ref-ytvos_numbered_cy_sanity.log @@ -0,0 +1,5967 @@ +skipped 57 short videos + + video num: 3471 clip num: 3414 + + +vid id: 003234408d + +-----------category name: penguin, frame name: 3 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: penguin, frame name: 12 +are penguins distinguished by action: YES + +-----------category name: penguin, frame name: 25 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: penguin, frame name: 32 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0043f083b5 + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +-----------category name: sedan, frame name: 10 +are sedans distinguished by action: NONE + +-----------category name: sedan, frame name: 14 +are sedans distinguished by action: "NONE" + +vid id: 0044fa5fba + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 005a527edd + +-----------category name: ape, frame name: 4 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... 
(2/3) +-----------category name: ape, frame name: 9 +are apes distinguished by action: YES + +-----------category name: ape, frame name: 15 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 24 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0065b171f9 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 00917dcfc4 + +-----------category name: zebra, frame name: 3 +are zebras distinguished by action: NONE + +-----------category name: zebra, frame name: 6 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: zebra, frame name: 12 +are zebras distinguished by action: YES + +-----------category name: zebra, frame name: 16 +are zebras distinguished by action: YES + +vid id: 00a23ccf53 + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +vid id: 00ad5016a4 + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +vid id: 01082ae388 + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +vid id: 011ac0a06f + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +vid id: 013099c098 + +-----------category name: giant_panda, frame name: 2 +are giant_pandas distinguished by action: YES + +-----------category name: giant_panda, frame name: 7 +are giant_pandas distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: giant_panda, frame name: 10 +are giant_pandas distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: giant_panda, frame name: 11 +are giant_pandas distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0155498c85 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +vid id: 01694ad9c8 + +Skipping bird: There is single or no object. + +Skipping bird: There is single or no object. + +Skipping bird: There is single or no object. 
+ +Skipping bird: There is single or no object. + +vid id: 017ac35701 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 01b80e8e1a + +-----------category name: zebra, frame name: 2 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: zebra, frame name: 5 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: zebra, frame name: 7 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: zebra, frame name: 9 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 01baa5a4e1 + +Skipping frisbee: Determined to be non-movable. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +vid id: 01c3111683 + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +vid id: 01c4cb5ffe + +Skipping person: There is single or no object. + +-----------category name: person, frame name: 15 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 22 +are persons distinguished by action: NONE + +Skipping person: There is single or no object. + +Skipping snowboard: Determined to be non-movable. + +Skipping snowboard: There is single or no object. + +Skipping snowboard: There is single or no object. + +Skipping snowboard: There is single or no object. + +Skipping snowboard: There is single or no object. + +vid id: 01c76f0a82 + +Skipping plant: Determined to be non-movable. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +-----------category name: sedan, frame name: 12 +are sedans distinguished by action: NONE + +-----------category name: sedan, frame name: 14 +are sedans distinguished by action: I'm unable to determine any actions or postures of sedans from images, as vehicles don't perform actions like people. + +vid id: 01c783268c + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +vid id: 01e64dd36a + +-----------category name: cow, frame name: 3 +are cows distinguished by action: NONE + +-----------category name: cow, frame name: 5 +are cows distinguished by action: YES + +-----------category name: cow, frame name: 10 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... 
(2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cow, frame name: 14 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 01ed275c6e + +-----------category name: giraffe, frame name: 4 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: giraffe, frame name: 8 +are giraffes distinguished by action: YES + +-----------category name: giraffe, frame name: 12 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: giraffe, frame name: 16 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 01ff60d1fa + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 020cd28cd2 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 02264db755 + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +vid id: 0248626d9a + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +vid id: 02668dbffa + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +vid id: 0274193026 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 02d28375aa + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +vid id: 031ccc99b1 + +-----------category name: person, frame name: 4 +are persons distinguished by action: YES + +-----------category name: person, frame name: 5 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Skipping person: There is single or no object. + +-----------category name: person, frame name: 14 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +vid id: 0321b18c10 + +Skipping elephant: There is single or no object. + +Skipping elephant: There is single or no object. + +Skipping elephant: There is single or no object. + +Skipping elephant: There is single or no object. 
+ +-----------category name: person, frame name: 3 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 7 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 8 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 13 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0348a45bca + +-----------category name: fish, frame name: 8 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 16 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 19 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 27 +are fishs distinguished by action: NONE + +vid id: 0355e92655 + +Skipping boat: There is single or no object. + +Skipping boat: There is single or no object. + +Skipping boat: There is single or no object. + +Skipping boat: There is single or no object. + +Skipping paddle: Determined to be non-movable. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 0358b938c1 + +Skipping elephant: There is single or no object. + +-----------category name: elephant, frame name: 7 +are elephants distinguished by action: YES + +-----------category name: elephant, frame name: 9 +are elephants distinguished by action: YES + +-----------category name: elephant, frame name: 16 +are elephants distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0368107cf1 + +-----------category name: person, frame name: 2 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 6 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 10 +are persons distinguished by action: I'm sorry, I cannot identify or analyze individuals in the image provided. + +-----------category name: person, frame name: 15 +are persons distinguished by action: NONE + +vid id: 0379ddf557 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 038b2cc71d + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 038c15a5dd + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. 
+ +vid id: 03a06cc98a + +-----------category name: giraffe, frame name: 5 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: giraffe, frame name: 8 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: giraffe, frame name: 12 +are giraffes distinguished by action: YES + +-----------category name: giraffe, frame name: 14 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +vid id: 03a63e187f + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 03c95b4dae + +-----------category name: elephant, frame name: 3 +are elephants distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: elephant, frame name: 5 +are elephants distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: elephant, frame name: 10 +are elephants distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: elephant, frame name: 16 +are elephants distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 03e2b57b0e + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 04194e1248 + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 04259896e2 + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 0444918a5f + +-----------category name: truck, frame name: 2 +are trucks distinguished by action: NONE + +-----------category name: truck, frame name: 9 +are trucks distinguished by action: NONE + +-----------category name: truck, frame name: 13 +are trucks distinguished by action: NONE + +-----------category name: truck, frame name: 16 +are trucks distinguished by action: NONE + +vid id: 04460a7a52 + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 04474174a4 + +-----------category name: ape, frame name: 4 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: ape, frame name: 12 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: ape, frame name: 22 +are apes distinguished by action: NONE + +-----------category name: ape, frame name: 31 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0450095513 + +Skipping snail: There is single or no object. + +Skipping snail: There is single or no object. + +Skipping snail: There is single or no object. + +Skipping snail: There is single or no object. + +vid id: 045f00aed2 + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +Skipping others: Determined to be non-movable. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 04667fabaa + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +vid id: 04735c5030 + +-----------category name: cat, frame name: 3 +are cats distinguished by action: YES + +-----------category name: cat, frame name: 6 +are cats distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cat, frame name: 10 +are cats distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: cat, frame name: 15 +are cats distinguished by action: YES + +Retrying caption generation... (1/3) +vid id: 04990d1915 + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +vid id: 04d62d9d98 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 04f21da964 + +Skipping monkey: There is single or no object. + +Skipping monkey: There is single or no object. + +Skipping monkey: There is single or no object. + +Skipping monkey: There is single or no object. + +vid id: 04fbad476e + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. 
+ +Skipping parrot: There is single or no object. + +vid id: 04fe256562 + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +vid id: 0503bf89c9 + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +vid id: 0536c9eed0 + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +vid id: 054acb238f + +Skipping owl: There is single or no object. + +Skipping owl: There is single or no object. + +Skipping owl: There is single or no object. + +Skipping owl: There is single or no object. + +vid id: 05579ca250 + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 056c200404 + +Skipping toilet: Determined to be non-movable. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +vid id: 05774f3a2c + +-----------category name: ape, frame name: 4 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 13 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 25 +are apes distinguished by action: NONE + +-----------category name: ape, frame name: 33 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 058a7592c8 + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. 
+ +vid id: 05a0a513df + +-----------category name: person, frame name: 4 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 9 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 12 +are persons distinguished by action: "NONE" + +-----------category name: person, frame name: 15 +are persons distinguished by action: NONE + +vid id: 05a569d8aa + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +vid id: 05aa652648 + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +vid id: 05d7715782 + +Skipping sign: Determined to be non-movable. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +vid id: 05e0b0f28f + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +vid id: 05fdbbdd7a + +Skipping umbrella: Determined to be non-movable. + +vid id: 05ffcfed85 + +Skipping monkey: There is single or no object. + +-----------category name: monkey, frame name: 15 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: monkey, frame name: 22 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Skipping monkey: There is single or no object. + +vid id: 0630391881 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping tennis_racket: Determined to be non-movable. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +vid id: 06840b2bbe + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +vid id: 068f7dce6f + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +vid id: 0693719753 + +-----------category name: turtle, frame name: 7 +are turtles distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: turtle, frame name: 12 +are turtles distinguished by action: NONE + +-----------category name: turtle, frame name: 15 +are turtles distinguished by action: "NONE" + +-----------category name: turtle, frame name: 20 +are turtles distinguished by action: NONE + +vid id: 06ce2b51fb + +Skipping paddle: Determined to be non-movable. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +-----------category name: person, frame name: 2 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: person, frame name: 7 +are persons distinguished by action: YES + +-----------category name: person, frame name: 9 +are persons distinguished by action: "NONE" + +-----------category name: person, frame name: 12 +are persons distinguished by action: NONE + +vid id: 06e224798e + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +vid id: 06ee361788 + +-----------category name: duck, frame name: 3 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 6 +are ducks distinguished by action: YES + +-----------category name: duck, frame name: 10 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 14 +are ducks distinguished by action: NONE + +vid id: 06fbb3fa2c + +Skipping eagle: There is single or no object. + +Skipping eagle: There is single or no object. + +Skipping eagle: There is single or no object. + +Skipping eagle: There is single or no object. + +vid id: 0700264286 + +-----------category name: cow, frame name: 4 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: cow, frame name: 6 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cow, frame name: 8 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: cow, frame name: 12 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 070c918ca7 + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +vid id: 07129e14a4 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +-----------category name: parrot, frame name: 2 +are parrots distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: parrot, frame name: 10 +are parrots distinguished by action: YES + +-----------category name: parrot, frame name: 22 +are parrots distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: parrot, frame name: 30 +are parrots distinguished by action: NONE + +vid id: 07177017e9 + +-----------category name: motorbike, frame name: 4 +are motorbikes distinguished by action: "NONE" + +-----------category name: motorbike, frame name: 6 +are motorbikes distinguished by action: NONE + +-----------category name: motorbike, frame name: 9 +are motorbikes distinguished by action: "NONE" + +-----------category name: motorbike, frame name: 13 +are motorbikes distinguished by action: "NONE" + +vid id: 07238ffc58 + +-----------category name: monkey, frame name: 6 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: monkey, frame name: 14 +are monkeys distinguished by action: YES + +-----------category name: monkey, frame name: 25 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: monkey, frame name: 28 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 07353b2a89 + +-----------category name: sheep, frame name: 6 +are sheeps distinguished by action: NONE + +-----------category name: sheep, frame name: 8 +are sheeps distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: sheep, frame name: 17 +are sheeps distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: sheep, frame name: 25 +are sheeps distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0738493cbf + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +vid id: 075926c651 + +-----------category name: person, frame name: 2 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 075c701292 + +-----------category name: duck, frame name: 8 +are ducks distinguished by action: YES + +Retrying caption generation... 
(1/3) +-----------category name: duck, frame name: 14 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 18 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 33 +are ducks distinguished by action: NONE + +vid id: 0762ea9a30 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 07652ee4af + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 076f206928 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +-----------category name: zebra, frame name: 3 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: zebra, frame name: 9 +are zebras distinguished by action: "NONE" + +-----------category name: zebra, frame name: 10 +are zebras distinguished by action: "NONE" + +-----------category name: zebra, frame name: 16 +are zebras distinguished by action: "NONE" + +vid id: 077d32af19 + +-----------category name: person, frame name: 5 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 9 +are persons distinguished by action: "NONE" + +-----------category name: person, frame name: 10 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 14 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +vid id: 079049275c + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +vid id: 07913cdda7 + +-----------category name: person, frame name: 4 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: person, frame name: 7 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 9 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 11 +are persons distinguished by action: NONE + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. 
+ +vid id: 07a11a35e8 + +-----------category name: ape, frame name: 6 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 15 +are apes distinguished by action: YES + +-----------category name: ape, frame name: 21 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: ape, frame name: 29 +are apes distinguished by action: YES + +vid id: 07ac33b6df + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +vid id: 07c62c3d11 + +-----------category name: parrot, frame name: 8 +are parrots distinguished by action: "NONE" + +-----------category name: parrot, frame name: 17 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: parrot, frame name: 18 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: parrot, frame name: 30 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 07cc1c7d74 + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +vid id: 080196ef01 + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping hand: Determined to be non-movable. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +vid id: 081207976e + +Skipping hat: Determined to be non-movable. + +vid id: 081ae4fa44 + +-----------category name: shark, frame name: 2 +are sharks distinguished by action: NONE + +-----------category name: shark, frame name: 13 +are sharks distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: shark, frame name: 14 +are sharks distinguished by action: YES + +-----------category name: shark, frame name: 22 +are sharks distinguished by action: "NONE" + +vid id: 081d8250cb + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. 
+ +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 082900c5d4 + +-----------category name: duck, frame name: 4 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 7 +are ducks distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: duck, frame name: 10 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 17 +are ducks distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0860df21e2 + +Skipping frisbee: Determined to be non-movable. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +vid id: 0866d4c5e3 + +-----------category name: bird, frame name: 2 +are birds distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: bird, frame name: 6 +are birds distinguished by action: NONE + +-----------category name: bird, frame name: 8 +are birds distinguished by action: NONE + +-----------category name: bird, frame name: 11 +are birds distinguished by action: "NONE" + +vid id: 0891ac2eb6 + +-----------category name: person, frame name: 3 +are persons distinguished by action: YES + +-----------category name: person, frame name: 6 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: person, frame name: 10 +are persons distinguished by action: YES + +-----------category name: person, frame name: 12 +are persons distinguished by action: YES + +vid id: 08931bc458 + +Skipping others: Determined to be non-movable. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 08aa2705d5 + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +vid id: 08c8450db7 + +Skipping toilet: Determined to be non-movable. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +vid id: 08d50b926c + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. 
+ +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +vid id: 08e1e4de15 + +-----------category name: monkey, frame name: 2 +are monkeys distinguished by action: YES + +-----------category name: monkey, frame name: 10 +are monkeys distinguished by action: YES + +-----------category name: monkey, frame name: 22 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: monkey, frame name: 32 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 08e48c1a48 + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +vid id: 08f561c65e + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 08feb87790 + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +vid id: 09049f6fe3 + +-----------category name: mouse, frame name: 4 +are mouses distinguished by action: YES + +-----------category name: mouse, frame name: 11 +are mouses distinguished by action: "NONE" + +-----------category name: mouse, frame name: 16 +are mouses distinguished by action: "NONE" + +-----------category name: mouse, frame name: 27 +are mouses distinguished by action: NONE + +vid id: 092e4ff450 + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +vid id: 09338adea8 + +-----------category name: whale, frame name: 7 +are whales distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: whale, frame name: 11 +are whales distinguished by action: YES + +-----------category name: whale, frame name: 19 +are whales distinguished by action: NONE + +-----------category name: whale, frame name: 29 +are whales distinguished by action: "NONE" + +vid id: 093c335ccc + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping frisbee: Determined to be non-movable. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +vid id: 0970d28339 + +-----------category name: ape, frame name: 8 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: ape, frame name: 17 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 22 +are apes distinguished by action: YES + +-----------category name: ape, frame name: 33 +are apes distinguished by action: YES + +vid id: 0974a213dc + +-----------category name: giraffe, frame name: 5 +are giraffes distinguished by action: YES + +-----------category name: giraffe, frame name: 7 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: giraffe, frame name: 10 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: giraffe, frame name: 17 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 097b471ed8 + +-----------category name: cat, frame name: 2 +are cats distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: cat, frame name: 7 +are cats distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: cat, frame name: 12 +are cats distinguished by action: "NONE" + +-----------category name: cat, frame name: 14 +are cats distinguished by action: NONE + +vid id: 0990941758 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 09a348f4fa + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 09a6841288 + +-----------category name: duck, frame name: 4 +are ducks distinguished by action: "NONE" + +-----------category name: duck, frame name: 7 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 13 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 16 +are ducks distinguished by action: "NONE" + +vid id: 09c5bad17b + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +vid id: 09c9ce80c7 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 09ff54fef4 + +-----------category name: fox, frame name: 5 +are foxs distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: fox, frame name: 10 +are foxs distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. 
+[... caption-generation run log continues for vid ids 0a23765d15 through 185bf64702: for each video and category the script records a per-frame judgment "are <category>s distinguished by action: YES / NONE", skips categories with "Skipping <category>: There is single or no object." or "Skipping <category>: Determined to be non-movable.", and on failed generations retries up to three times ("Retrying caption generation... (1/3)" ... "(3/3)") before logging "Max retries reached. Caption generation failed." ...]
(3/3) +Max retries reached. Caption generation failed. +Skipping zebra: There is single or no object. + +-----------category name: zebra, frame name: 15 +are zebras distinguished by action: YES + +vid id: 18913cc690 + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +vid id: 1892651815 + +Skipping camel: There is single or no object. + +Skipping camel: There is single or no object. + +Skipping camel: There is single or no object. + +Skipping camel: There is single or no object. + +vid id: 189ac8208a + +-----------category name: giraffe, frame name: 2 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: giraffe, frame name: 6 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: giraffe, frame name: 8 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: giraffe, frame name: 11 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 189b44e92c + +Skipping zebra: There is single or no object. + +Skipping zebra: There is single or no object. + +Skipping zebra: There is single or no object. + +Skipping zebra: There is single or no object. + +vid id: 18ac264b76 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 18b245ab49 + +-----------category name: penguin, frame name: 4 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: penguin, frame name: 5 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: penguin, frame name: 10 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: penguin, frame name: 13 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +vid id: 18b5cebc34 + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +vid id: 18bad52083 + +-----------category name: parrot, frame name: 2 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: parrot, frame name: 11 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: parrot, frame name: 18 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: parrot, frame name: 31 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 18bb5144d5 + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 18c6f205c5 + +-----------category name: person, frame name: 4 +are persons distinguished by action: YES + +-----------category name: person, frame name: 6 +are persons distinguished by action: YES + +-----------category name: person, frame name: 10 +are persons distinguished by action: YES + +-----------category name: person, frame name: 14 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +vid id: 1903f9ea15 + +-----------category name: bird, frame name: 4 +are birds distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: bird, frame name: 6 +are birds distinguished by action: NONE + +-----------category name: bird, frame name: 10 +are birds distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: bird, frame name: 14 +are birds distinguished by action: NONE + +vid id: 1917b209f2 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +-----------category name: cow, frame name: 4 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cow, frame name: 7 +are cows distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cow, frame name: 8 +are cows distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cow, frame name: 16 +are cows distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +Skipping horse: There is single or no object. + +Skipping horse: There is single or no object. + +Skipping horse: There is single or no object. + +Skipping horse: There is single or no object. + +vid id: 191e74c01d + +Skipping deer: There is single or no object. + +Skipping deer: There is single or no object. + +Skipping deer: There is single or no object. 
+ +Skipping deer: There is single or no object. + +vid id: 19367bb94e + +-----------category name: fish, frame name: 9 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 17 +are fishs distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: fish, frame name: 24 +are fishs distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: fish, frame name: 26 +are fishs distinguished by action: NONE + +vid id: 193ffaa217 + +-----------category name: person, frame name: 2 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 7 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 8 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 13 +are persons distinguished by action: NONE + +vid id: 19696b67d3 + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +vid id: 197f3ab6f3 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 1981e763cc + +-----------category name: sheep, frame name: 2 +are sheeps distinguished by action: YES + +-----------category name: sheep, frame name: 17 +are sheeps distinguished by action: NONE + +-----------category name: sheep, frame name: 20 +are sheeps distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: sheep, frame name: 29 +are sheeps distinguished by action: YES + +Retrying caption generation... (1/3) +vid id: 198afe39ae + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping surfboard: Determined to be non-movable. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +vid id: 19a6e62b9b + +Skipping monkey: There is single or no object. + +Skipping monkey: There is single or no object. + +Skipping monkey: There is single or no object. + +-----------category name: monkey, frame name: 24 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 19b60d5335 + +Skipping hand: Determined to be non-movable. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +vid id: 19c00c11f9 + +Skipping person: There is single or no object. 
+ +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping surfboard: Determined to be non-movable. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +vid id: 19e061eb88 + +-----------category name: boat, frame name: 4 +are boats distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: boat, frame name: 5 +are boats distinguished by action: NONE + +-----------category name: boat, frame name: 10 +are boats distinguished by action: NONE + +-----------category name: boat, frame name: 15 +are boats distinguished by action: NONE + +vid id: 19e8bc6178 + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +vid id: 19ee80dac6 + +-----------category name: person, frame name: 3 +are persons distinguished by action: YES + +-----------category name: person, frame name: 17 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping surfboard: Determined to be non-movable. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +vid id: 1a25a9170a + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +-----------category name: person, frame name: 6 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 16 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: person, frame name: 23 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 26 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1a359a6c1a + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +vid id: 1a3e87c566 + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +vid id: 1a5fe06b00 + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +vid id: 1a6c0fbd1e + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. 
+ +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 1a6f3b5a4b + +Skipping bike: Determined to be non-movable. + +Skipping bike: There is single or no object. + +Skipping bike: There is single or no object. + +Skipping bike: There is single or no object. + +Skipping bike: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping hand: Determined to be non-movable. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +vid id: 1a8afbad92 + +-----------category name: zebra, frame name: 3 +are zebras distinguished by action: "NONE" + +-----------category name: zebra, frame name: 5 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: zebra, frame name: 10 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Skipping zebra: There is single or no object. + +vid id: 1a8bdc5842 + +-----------category name: parrot, frame name: 3 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: parrot, frame name: 11 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: parrot, frame name: 14 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: parrot, frame name: 23 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +vid id: 1a95752aca + +-----------category name: duck, frame name: 4 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 10 +are ducks distinguished by action: "NONE" + +-----------category name: duck, frame name: 14 +are ducks distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: duck, frame name: 27 +are ducks distinguished by action: "NONE" + +vid id: 1a9c131cb7 + +-----------category name: ape, frame name: 6 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 17 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 20 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... 
(2/3) +-----------category name: ape, frame name: 27 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1aa3da3ee3 + +-----------category name: sheep, frame name: 2 +are sheeps distinguished by action: "NONE" + +-----------category name: sheep, frame name: 9 +are sheeps distinguished by action: "NONE" + +-----------category name: sheep, frame name: 15 +are sheeps distinguished by action: "NONE" + +-----------category name: sheep, frame name: 25 +are sheeps distinguished by action: "NONE" + +vid id: 1ab27ec7ea + +Skipping deer: There is single or no object. + +Skipping deer: There is single or no object. + +Skipping deer: There is single or no object. + +Skipping deer: There is single or no object. + +vid id: 1abf16d21d + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +vid id: 1acd0f993b + +Skipping frisbee: Determined to be non-movable. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 1ad202e499 + +-----------category name: lizard, frame name: 6 +are lizards distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: lizard, frame name: 14 +are lizards distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: lizard, frame name: 22 +are lizards distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: lizard, frame name: 31 +are lizards distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1af8d2395d + +Skipping parachute: Determined to be non-movable. + +Skipping parachute: There is single or no object. + +Skipping parachute: There is single or no object. + +Skipping parachute: There is single or no object. + +Skipping parachute: There is single or no object. + +-----------category name: person, frame name: 6 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 13 +are persons distinguished by action: YES + +-----------category name: person, frame name: 20 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: person, frame name: 28 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +vid id: 1afd39a1fa + +Skipping hand: Determined to be non-movable. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +vid id: 1b2d31306f + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 1b3fa67f0e + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +vid id: 1b43fa74b4 + +-----------category name: owl, frame name: 7 +are owls distinguished by action: NONE + +-----------category name: owl, frame name: 12 +are owls distinguished by action: "NONE" + +-----------category name: owl, frame name: 19 +are owls distinguished by action: NONE + +-----------category name: owl, frame name: 20 +are owls distinguished by action: NONE + +vid id: 1b73ea9fc2 + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +vid id: 1b7e8bb255 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping hat: Determined to be non-movable. + +Skipping hat: There is single or no object. + +Skipping hat: There is single or no object. + +Skipping hat: There is single or no object. + +Skipping hat: There is single or no object. + +vid id: 1b8680f8cd + +Skipping tennis_racket: Determined to be non-movable. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +-----------category name: person, frame name: 6 +are persons distinguished by action: YES + +-----------category name: person, frame name: 9 +are persons distinguished by action: YES + +-----------category name: person, frame name: 14 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: person, frame name: 21 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... 
(2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1b883843c0 + +-----------category name: person, frame name: 4 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 7 +are persons distinguished by action: YES + +-----------category name: person, frame name: 9 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 14 +are persons distinguished by action: "NONE" + +vid id: 1b8898785b + +-----------category name: monkey, frame name: 9 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: monkey, frame name: 14 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: monkey, frame name: 22 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Skipping monkey: There is single or no object. + +vid id: 1b88ba1aa4 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 1b96a498e5 + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +vid id: 1bbc4c274f + +Skipping fish: There is single or no object. + +Skipping fish: There is single or no object. + +Skipping fish: There is single or no object. + +Skipping fish: There is single or no object. + +vid id: 1bd87fe9ab + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +vid id: 1c4090c75b + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +vid id: 1c41934f84 + +-----------category name: elephant, frame name: 5 +are elephants distinguished by action: YES + +-----------category name: elephant, frame name: 6 +are elephants distinguished by action: "NONE" + +-----------category name: elephant, frame name: 13 +are elephants distinguished by action: NONE + +-----------category name: elephant, frame name: 16 +are elephants distinguished by action: NONE + +vid id: 1c72b04b56 + +Skipping lion: There is single or no object. + +Skipping lion: There is single or no object. + +Skipping lion: There is single or no object. + +Skipping lion: There is single or no object. + +vid id: 1c87955a3a + +Skipping crocodile: There is single or no object. + +Skipping crocodile: There is single or no object. + +Skipping crocodile: There is single or no object. + +Skipping crocodile: There is single or no object. + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +vid id: 1c9f9eb792 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. 
+ +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 1ca240fede + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +vid id: 1ca5673803 + +-----------category name: person, frame name: 8 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 14 +are persons distinguished by action: YES + +-----------category name: person, frame name: 21 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 27 +are persons distinguished by action: YES + +Skipping tennis_racket: Determined to be non-movable. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +vid id: 1cada35274 + +Skipping duck: There is single or no object. + +Skipping duck: There is single or no object. + +Skipping duck: There is single or no object. + +Skipping duck: There is single or no object. + +vid id: 1cb44b920d + +-----------category name: eagle, frame name: 5 +are eagles distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: eagle, frame name: 13 +are eagles distinguished by action: YES + +-----------category name: eagle, frame name: 18 +are eagles distinguished by action: YES + +-----------category name: eagle, frame name: 27 +are eagles distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1cd10e62be + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +vid id: 1d3087d5e5 + +-----------category name: fish, frame name: 5 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 11 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 23 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 33 +are fishs distinguished by action: NONE + +vid id: 1d3685150a + +Skipping sign: Determined to be non-movable. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +-----------category name: person, frame name: 2 +are persons distinguished by action: YES + +-----------category name: person, frame name: 6 +are persons distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: person, frame name: 8 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
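The retry behaviour visible throughout this log (up to three further attempts, then a hard failure) suggests a simple capped-retry loop. A minimal, hypothetical sketch of that pattern; generate_caption and caption_is_valid are placeholder names, since the generation script itself is not shown in this diff:

def caption_with_retries(frame, category, generate_caption, caption_is_valid, max_retries=3):
    # One initial attempt plus up to max_retries further attempts, echoing the log messages above.
    for attempt in range(max_retries + 1):
        caption = generate_caption(frame, category)
        if caption_is_valid(caption):
            return caption
        if attempt < max_retries:
            print(f"Retrying caption generation... ({attempt + 1}/{max_retries})")
    print("Max retries reached. Caption generation failed.")
    return None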
+-----------category name: person, frame name: 15 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1d6ff083aa + +-----------category name: person, frame name: 2 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: person, frame name: 9 +are persons distinguished by action: YES + +-----------category name: person, frame name: 10 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: person, frame name: 17 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +Finished! diff --git a/models/ops/functions/__init__.py b/models/ops/functions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8a2197bda3199aa32cafc5b9d396479609853dd2 --- /dev/null +++ b/models/ops/functions/__init__.py @@ -0,0 +1,10 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +from .ms_deform_attn_func import MSDeformAttnFunction + diff --git a/models/ops/setup.py b/models/ops/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..a0131bc21cf1b45b90fcf174e2c53e4c08e9c641 --- /dev/null +++ b/models/ops/setup.py @@ -0,0 +1,71 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +import os +import glob + +import torch + +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +from setuptools import find_packages +from setuptools import setup + +requirements = ["torch", "torchvision"] + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "src") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + sources = main_file + source_cpu + extension = CppExtension + extra_compile_args = {"cxx": []} + define_macros = [] + + if torch.cuda.is_available() and CUDA_HOME is not None: + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + else: + raise NotImplementedError('CUDA is not available') + + sources = [os.path.join(extensions_dir, s) for s in sources] + include_dirs = [extensions_dir] + ext_modules = [ + extension( + "MultiScaleDeformableAttention", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + return ext_modules + +setup( + name="MultiScaleDeformableAttention", + version="1.0", + author="Weijie Su", + url="https://github.com/fundamentalvision/Deformable-DETR", + description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", + packages=find_packages(exclude=("configs", "tests",)), + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/models/ops/src/cpu/ms_deform_attn_cpu.h b/models/ops/src/cpu/ms_deform_attn_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..81b7b58a3d9502bbb684dc84687a526dedf94cae --- /dev/null +++ b/models/ops/src/cpu/ms_deform_attn_cpu.h @@ -0,0 +1,33 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved.
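A usage note on the build script above: the extension is typically compiled once per environment before training, and the result can be smoke-tested with a short import check. A minimal sketch, assuming a CUDA-capable machine and the models/ops layout shown in this diff:

# Compile and install the extension declared in setup.py (run from models/ops):
#   python setup.py build install
# Then confirm that the compiled op and its autograd wrapper both import cleanly:
import torch
import MultiScaleDeformableAttention                             # name from the ext_modules entry above
from functions.ms_deform_attn_func import MSDeformAttnFunction   # wrapper exercised by test.py below

assert torch.cuda.is_available(), "setup.py above only builds the CUDA path"
print("compiled op loaded from:", MultiScaleDeformableAttention.__file__)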
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once +#include + +at::Tensor +ms_deform_attn_cpu_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +std::vector +ms_deform_attn_cpu_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step); + + diff --git a/models/ops/test.py b/models/ops/test.py new file mode 100644 index 0000000000000000000000000000000000000000..8dbf6d5547d131f01a8c5c28b76557bd27a9334b --- /dev/null +++ b/models/ops/test.py @@ -0,0 +1,89 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import time +import torch +import torch.nn as nn +from torch.autograd import gradcheck + +from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch + + +N, M, D = 1, 2, 2 +Lq, L, P = 2, 2, 2 +shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() +level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) +S = sum([(H*W).item() for H, W in shapes]) + + +torch.manual_seed(3) + + +@torch.no_grad() +def check_forward_equal_with_pytorch_double(): + value = torch.rand(N, S, M, D).cuda() * 0.01 + sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() + attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) + im2col_step = 2 + output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() + output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() + fwdok = torch.allclose(output_cuda, output_pytorch) + max_abs_err = (output_cuda - output_pytorch).abs().max() + max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() + + print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + +@torch.no_grad() +def check_forward_equal_with_pytorch_float(): + value = torch.rand(N, S, M, D).cuda() * 0.01 + sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() + attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, 
keepdim=True) + im2col_step = 2 + output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() + output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() + fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) + max_abs_err = (output_cuda - output_pytorch).abs().max() + max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() + + print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + +def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): + + value = torch.rand(N, S, M, channels).cuda() * 0.01 + sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() + attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) + im2col_step = 2 + func = MSDeformAttnFunction.apply + + value.requires_grad = grad_value + sampling_locations.requires_grad = grad_sampling_loc + attention_weights.requires_grad = grad_attn_weight + + gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) + + print(f'* {gradok} check_gradient_numerical(D={channels})') + + +if __name__ == '__main__': + check_forward_equal_with_pytorch_double() + check_forward_equal_with_pytorch_float() + + for channels in [30, 32, 64, 71, 1025, 2048, 3096]: + check_gradient_numerical(channels, True, True, True) + + + diff --git a/models/referformer.py b/models/referformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f57096b288ece0f53afc7d31dc5dc465cdebfb --- /dev/null +++ b/models/referformer.py @@ -0,0 +1,639 @@ +""" +ReferFormer model class. +Modified from DETR (https://github.com/facebookresearch/detr) +""" +import torch +import torch.nn.functional as F +from torch import nn + +import os +import math +from util import box_ops +from util.misc import (NestedTensor, nested_tensor_from_tensor_list, + nested_tensor_from_videos_list, + accuracy, get_world_size, interpolate, + is_dist_avail_and_initialized, inverse_sigmoid) + +from .position_encoding import PositionEmbeddingSine1D +from .backbone import build_backbone +from .deformable_transformer import build_deforamble_transformer +from .segmentation import CrossModalFPNDecoder, VisionLanguageFusionModule +from .matcher import build_matcher +from .criterion import SetCriterion +from .postprocessors import build_postprocessors + +from transformers import BertTokenizer, BertModel, RobertaModel, RobertaTokenizerFast + +import copy +from einops import rearrange, repeat + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +os.environ["TOKENIZERS_PARALLELISM"] = "false" # this disables a huggingface tokenizer warning (printed every epoch) + +class ReferFormer(nn.Module): + """ This is the ReferFormer module that performs referring video object detection """ + def __init__(self, backbone, transformer, num_classes, num_queries, num_feature_levels, + num_frames, mask_dim, dim_feedforward, + controller_layers, dynamic_mask_channels, + aux_loss=False, with_box_refine=False, two_stage=False, + freeze_text_encoder=False, rel_coord=True): + """ Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. 
See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_classes: number of object classes + num_queries: number of object queries, i.e., detection slots. This is the maximal number of objects + ReferFormer can detect in a video. For ytvos, we recommend 5 queries for each frame. + num_frames: number of clip frames + mask_dim: dynamic conv intermediate layer channel number. + dim_feedforward: vision-language fusion module ffn channel number. + dynamic_mask_channels: the mask feature output channel number. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + """ + super().__init__() + self.num_queries = num_queries + self.transformer = transformer + hidden_dim = transformer.d_model + self.hidden_dim = hidden_dim + self.class_embed = nn.Linear(hidden_dim, num_classes) + self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + self.num_feature_levels = num_feature_levels + + # Build Transformer + # NOTE: different from deformable detr, the query_embed out channels is + # hidden_dim instead of hidden_dim * 2 + # This is because the input to the decoder is the text embedding feature + self.query_embed = nn.Embedding(num_queries, hidden_dim) + + # follow deformable-detr, we use the last three stages of backbone + if num_feature_levels > 1: + num_backbone_outs = len(backbone.strides[-3:]) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.num_channels[-3:][_] + input_proj_list.append(nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + )) + for _ in range(num_feature_levels - num_backbone_outs): # downsample 2x + input_proj_list.append(nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + )) + in_channels = hidden_dim + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(backbone.num_channels[-3:][0], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + )]) + + self.num_frames = num_frames + self.mask_dim = mask_dim + self.backbone = backbone + self.aux_loss = aux_loss + self.with_box_refine = with_box_refine + assert two_stage == False, "args.two_stage must be false!"
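# Illustrative shape bookkeeping for the projection layers built above, assuming a
# ResNet-50 style backbone (res3/res4/res5 with 512/1024/2048 channels) and
# num_feature_levels = 4; actual channel counts depend on the chosen backbone:
#   input_proj[0]: 1x1 Conv  512 -> hidden_dim, followed by GroupNorm(32)
#   input_proj[1]: 1x1 Conv 1024 -> hidden_dim, followed by GroupNorm(32)
#   input_proj[2]: 1x1 Conv 2048 -> hidden_dim, followed by GroupNorm(32)
#   input_proj[3]: 3x3 stride-2 Conv 2048 -> hidden_dim, followed by GroupNorm(32),
#                  applied to the raw res5 map in forward() to add one coarser level.
# Every level therefore enters the deformable encoder with hidden_dim channels.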
+ + # initialization + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(num_classes) * bias_value + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + + num_pred = transformer.decoder.num_layers + if with_box_refine: + self.class_embed = _get_clones(self.class_embed, num_pred) + self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # hack implementation for iterative bounding box refinement + self.transformer.decoder.bbox_embed = self.bbox_embed + else: + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.transformer.decoder.bbox_embed = None + + # Build Text Encoder + # self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + # self.text_encoder = BertModel.from_pretrained('bert-base-cased') + self.tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base') + self.text_encoder = RobertaModel.from_pretrained('roberta-base') + + if freeze_text_encoder: + for p in self.text_encoder.parameters(): + p.requires_grad_(False) + + # resize the bert output channel to transformer d_model + self.resizer = FeatureResizer( + input_feat_size=768, + output_feat_size=hidden_dim, + dropout=0.1, + ) + + self.fusion_module = VisionLanguageFusionModule(d_model=hidden_dim, nhead=8) + self.text_pos = PositionEmbeddingSine1D(hidden_dim, normalize=True) + + # Build FPN Decoder + self.rel_coord = rel_coord + feature_channels = [self.backbone.num_channels[0]] + 3 * [hidden_dim] + self.pixel_decoder = CrossModalFPNDecoder(feature_channels=feature_channels, conv_dim=hidden_dim, + mask_dim=mask_dim, dim_feedforward=dim_feedforward, norm="GN") + + # Build Dynamic Conv + self.controller_layers = controller_layers + self.in_channels = mask_dim + self.dynamic_mask_channels = dynamic_mask_channels + self.mask_out_stride = 4 + self.mask_feat_stride = 4 + + weight_nums, bias_nums = [], [] + for l in range(self.controller_layers): + if l == 0: + if self.rel_coord: + weight_nums.append((self.in_channels + 2) * self.dynamic_mask_channels) + else: + weight_nums.append(self.in_channels * self.dynamic_mask_channels) + bias_nums.append(self.dynamic_mask_channels) + elif l == self.controller_layers - 1: + weight_nums.append(self.dynamic_mask_channels * 1) # output layer c -> 1 + bias_nums.append(1) + else: + weight_nums.append(self.dynamic_mask_channels * self.dynamic_mask_channels) + bias_nums.append(self.dynamic_mask_channels) + + self.weight_nums = weight_nums + self.bias_nums = bias_nums + self.num_gen_params = sum(weight_nums) + sum(bias_nums) + + self.controller = MLP(hidden_dim, hidden_dim, self.num_gen_params, 3) + for layer in self.controller.layers: + nn.init.zeros_(layer.bias) + nn.init.xavier_uniform_(layer.weight) + + + def forward(self, samples: NestedTensor, captions, targets): + """ The forward expects a NestedTensor, which consists of: + - samples.tensors: image sequences, of shape [num_frames x 3 x H x W] + - samples.mask: a binary mask of shape [num_frames x H x W], containing 1 on padded pixels + - captions: list[str] + - targets: list[dict] + + It returns a dict with the following elements: + 
- "pred_masks": Shape = [batch_size x num_queries x out_h x out_w] + + - "pred_logits": the classification logits (including no-object) for all queries. + Shape= [batch_size x num_queries x num_classes] + - "pred_boxes": The normalized boxes coordinates for all queries, represented as + (center_x, center_y, height, width). These values are normalized in [0, 1], + relative to the size of each individual image (disregarding possible padding). + See PostProcess for information on how to retrieve the unnormalized bounding box. + - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of + dictionnaries containing the two above keys for each decoder layer. + """ + # Backbone + if not isinstance(samples, NestedTensor): + samples = nested_tensor_from_videos_list(samples) + + # features (list[NestedTensor]): res2 -> res5, shape of tensors is [B*T, Ci, Hi, Wi] + # pos (list[Tensor]): shape of [B*T, C, Hi, Wi] + features, pos = self.backbone(samples) + + b = len(captions) + t = pos[0].shape[0] // b + + # For A2D-Sentences and JHMDB-Sentencs dataset, only one frame is annotated for a clip + if 'valid_indices' in targets[0]: + valid_indices = torch.tensor([i * t + target['valid_indices'] for i, target in enumerate(targets)]).to(pos[0].device) + for feature in features: + feature.tensors = feature.tensors.index_select(0, valid_indices) + feature.mask = feature.mask.index_select(0, valid_indices) + for i, p in enumerate(pos): + pos[i] = p.index_select(0, valid_indices) + samples.mask = samples.mask.index_select(0, valid_indices) + # t: num_frames -> 1 + t = 1 + + text_features, text_sentence_features = self.forward_text(captions, device=pos[0].device) + + # prepare vision and text features for transformer + srcs = [] + masks = [] + poses = [] + + text_pos = self.text_pos(text_features).permute(2, 0, 1) # [length, batch_size, c] + text_word_features, text_word_masks = text_features.decompose() + text_word_features = text_word_features.permute(1, 0, 2) # [length, batch_size, c] + + # Follow Deformable-DETR, we use the last three stages outputs from backbone + for l, (feat, pos_l) in enumerate(zip(features[-3:], pos[-3:])): + src, mask = feat.decompose() + src_proj_l = self.input_proj[l](src) + n, c, h, w = src_proj_l.shape + + # vision language early-fusion + src_proj_l = rearrange(src_proj_l, '(b t) c h w -> (t h w) b c', b=b, t=t) + src_proj_l = self.fusion_module(tgt=src_proj_l, + memory=text_word_features, + memory_key_padding_mask=text_word_masks, + pos=text_pos, + query_pos=None + ) + src_proj_l = rearrange(src_proj_l, '(t h w) b c -> (b t) c h w', t=t, h=h, w=w) + + srcs.append(src_proj_l) + masks.append(mask) + poses.append(pos_l) + assert mask is not None + + if self.num_feature_levels > (len(features) - 1): + _len_srcs = len(features) - 1 # fpn level + for l in range(_len_srcs, self.num_feature_levels): + if l == _len_srcs: + src = self.input_proj[l](features[-1].tensors) + else: + src = self.input_proj[l](srcs[-1]) + m = samples.mask + mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype) + n, c, h, w = src.shape + + # vision language early-fusion + src = rearrange(src, '(b t) c h w -> (t h w) b c', b=b, t=t) + src = self.fusion_module(tgt=src, + memory=text_word_features, + memory_key_padding_mask=text_word_masks, + pos=text_pos, + query_pos=None + ) + src = rearrange(src, '(t h w) b c -> (b t) c h w', t=t, h=h, w=w) + + srcs.append(src) + masks.append(mask) + 
poses.append(pos_l) + + # Transformer + query_embeds = self.query_embed.weight # [num_queries, c] + text_embed = repeat(text_sentence_features, 'b c -> b t q c', t=t, q=self.num_queries) + hs, memory, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, inter_samples = \ + self.transformer(srcs, text_embed, masks, poses, query_embeds) + # hs: [l, batch_size*time, num_queries_per_frame, c] + # memory: list[Tensor], shape of tensor is [batch_size*time, c, hi, wi] + # init_reference: [batch_size*time, num_queries_per_frame, 2] + # inter_references: [l, batch_size*time, num_queries_per_frame, 4] + + out = {} + # prediction + outputs_classes = [] + outputs_coords = [] + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[lvl](hs[lvl]) + tmp = self.bbox_embed[lvl](hs[lvl]) + if reference.shape[-1] == 4: + tmp += reference + else: + assert reference.shape[-1] == 2 + tmp[..., :2] += reference + outputs_coord = tmp.sigmoid() # cxcywh, range in [0,1] + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + # rearrange + outputs_class = rearrange(outputs_class, 'l (b t) q k -> l b t q k', b=b, t=t) + outputs_coord = rearrange(outputs_coord, 'l (b t) q n -> l b t q n', b=b, t=t) + out['pred_logits'] = outputs_class[-1] # [batch_size, time, num_queries_per_frame, num_classes] + out['pred_boxes'] = outputs_coord[-1] # [batch_size, time, num_queries_per_frame, 4] + + # Segmentation + mask_features = self.pixel_decoder(features, text_features, pos, memory, nf=t) # [batch_size*time, c, out_h, out_w] + mask_features = rearrange(mask_features, '(b t) c h w -> b t c h w', b=b, t=t) + + # dynamic conv + outputs_seg_masks = [] + for lvl in range(hs.shape[0]): + dynamic_mask_head_params = self.controller(hs[lvl]) # [batch_size*time, num_queries_per_frame, num_params] + dynamic_mask_head_params = rearrange(dynamic_mask_head_params, '(b t) q n -> b (t q) n', b=b, t=t) + lvl_references = inter_references[lvl, ..., :2] + lvl_references = rearrange(lvl_references, '(b t) q n -> b (t q) n', b=b, t=t) + outputs_seg_mask = self.dynamic_mask_with_coords(mask_features, dynamic_mask_head_params, lvl_references, targets) + outputs_seg_mask = rearrange(outputs_seg_mask, 'b (t q) h w -> b t q h w', t=t) + outputs_seg_masks.append(outputs_seg_mask) + out['pred_masks'] = outputs_seg_masks[-1] # [batch_size, time, num_queries_per_frame, out_h, out_w] + + if self.aux_loss: + out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord, outputs_seg_masks) + + if not self.training: + # for visualization + inter_references = inter_references[-2, :, :, :2] # [batch_size*time, num_queries_per_frame, 2] + inter_references = rearrange(inter_references, '(b t) q n -> b t q n', b=b, t=t) + out['reference_points'] = inter_references # the reference points of last layer input + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord, outputs_seg_masks): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
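+        # The last decoder layer is skipped ([:-1]) because its predictions are already returned
+        # as the top-level 'pred_logits' / 'pred_boxes' / 'pred_masks'.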
+ return [{"pred_logits": a, "pred_boxes": b, "pred_masks": c} + for a, b, c in zip(outputs_class[:-1], outputs_coord[:-1], outputs_seg_masks[:-1])] + + def forward_text(self, captions, device): + if isinstance(captions[0], str): + tokenized = self.tokenizer.batch_encode_plus(captions, padding="longest", return_tensors="pt").to(device) + encoded_text = self.text_encoder(**tokenized) + # encoded_text.last_hidden_state: [batch_size, length, 768] + # encoded_text.pooler_output: [batch_size, 768] + text_attention_mask = tokenized.attention_mask.ne(1).bool() + # text_attention_mask: [batch_size, length] + + text_features = encoded_text.last_hidden_state + text_features = self.resizer(text_features) + text_masks = text_attention_mask + text_features = NestedTensor(text_features, text_masks) # NestedTensor + + text_sentence_features = encoded_text.pooler_output + text_sentence_features = self.resizer(text_sentence_features) + else: + raise ValueError("Please mask sure the caption is a list of string") + return text_features, text_sentence_features + + def dynamic_mask_with_coords(self, mask_features, mask_head_params, reference_points, targets): + """ + Add the relative coordinates to the mask_features channel dimension, + and perform dynamic mask conv. + + Args: + mask_features: [batch_size, time, c, h, w] + mask_head_params: [batch_size, time * num_queries_per_frame, num_params] + reference_points: [batch_size, time * num_queries_per_frame, 2], cxcy + targets (list[dict]): length is batch size + we need the key 'size' for computing location. + Return: + outputs_seg_mask: [batch_size, time * num_queries_per_frame, h, w] + """ + device = mask_features.device + b, t, c, h, w = mask_features.shape + # this is the total query number in all frames + _, num_queries = reference_points.shape[:2] + q = num_queries // t # num_queries_per_frame + + # prepare reference points in image size (the size is input size to the model) + new_reference_points = [] + for i in range(b): + img_h, img_w = targets[i]['size'] + scale_f = torch.stack([img_w, img_h], dim=0) + tmp_reference_points = reference_points[i] * scale_f[None, :] + new_reference_points.append(tmp_reference_points) + new_reference_points = torch.stack(new_reference_points, dim=0) + # [batch_size, time * num_queries_per_frame, 2], in image size + reference_points = new_reference_points + + # prepare the mask features + if self.rel_coord: + reference_points = rearrange(reference_points, 'b (t q) n -> b t q n', t=t, q=q) + locations = compute_locations(h, w, device=device, stride=self.mask_feat_stride) + relative_coords = reference_points.reshape(b, t, q, 1, 1, 2) - \ + locations.reshape(1, 1, 1, h, w, 2) # [batch_size, time, num_queries_per_frame, h, w, 2] + relative_coords = relative_coords.permute(0, 1, 2, 5, 3, 4) # [batch_size, time, num_queries_per_frame, 2, h, w] + + # concat features + mask_features = repeat(mask_features, 'b t c h w -> b t q c h w', q=q) # [batch_size, time, num_queries_per_frame, c, h, w] + mask_features = torch.cat([mask_features, relative_coords], dim=3) + else: + mask_features = repeat(mask_features, 'b t c h w -> b t q c h w', q=q) # [batch_size, time, num_queries_per_frame, c, h, w] + mask_features = mask_features.reshape(1, -1, h, w) + + # parse dynamic params + mask_head_params = mask_head_params.flatten(0, 1) + weights, biases = parse_dynamic_params( + mask_head_params, self.dynamic_mask_channels, + self.weight_nums, self.bias_nums + ) + + # dynamic mask conv + mask_logits = self.mask_heads_forward(mask_features, 
weights, biases, mask_head_params.shape[0]) + mask_logits = mask_logits.reshape(-1, 1, h, w) + + # upsample predicted masks + assert self.mask_feat_stride >= self.mask_out_stride + assert self.mask_feat_stride % self.mask_out_stride == 0 + + mask_logits = aligned_bilinear(mask_logits, int(self.mask_feat_stride / self.mask_out_stride)) + mask_logits = mask_logits.reshape(b, num_queries, mask_logits.shape[-2], mask_logits.shape[-1]) + + return mask_logits # [batch_size, time * num_queries_per_frame, h, w] + + def mask_heads_forward(self, features, weights, biases, num_insts): + ''' + :param features + :param weights: [w0, w1, ...] + :param bias: [b0, b1, ...] + :return: + ''' + assert features.dim() == 4 + n_layers = len(weights) + x = features + for i, (w, b) in enumerate(zip(weights, biases)): + x = F.conv2d( + x, w, bias=b, + stride=1, padding=0, + groups=num_insts + ) + if i < n_layers - 1: + x = F.relu(x) + return x + + +def parse_dynamic_params(params, channels, weight_nums, bias_nums): + assert params.dim() == 2 + assert len(weight_nums) == len(bias_nums) + assert params.size(1) == sum(weight_nums) + sum(bias_nums) + + num_insts = params.size(0) + num_layers = len(weight_nums) + + params_splits = list(torch.split_with_sizes(params, weight_nums + bias_nums, dim=1)) + + weight_splits = params_splits[:num_layers] + bias_splits = params_splits[num_layers:] + + for l in range(num_layers): + if l < num_layers - 1: + # out_channels x in_channels x 1 x 1 + weight_splits[l] = weight_splits[l].reshape(num_insts * channels, -1, 1, 1) + bias_splits[l] = bias_splits[l].reshape(num_insts * channels) + else: + # out_channels x in_channels x 1 x 1 + weight_splits[l] = weight_splits[l].reshape(num_insts * 1, -1, 1, 1) + bias_splits[l] = bias_splits[l].reshape(num_insts) + + return weight_splits, bias_splits + +def aligned_bilinear(tensor, factor): + assert tensor.dim() == 4 + assert factor >= 1 + assert int(factor) == factor + + if factor == 1: + return tensor + + h, w = tensor.size()[2:] + tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate") + oh = factor * h + 1 + ow = factor * w + 1 + tensor = F.interpolate( + tensor, size=(oh, ow), + mode='bilinear', + align_corners=True + ) + tensor = F.pad( + tensor, pad=(factor // 2, 0, factor // 2, 0), + mode="replicate" + ) + + return tensor[:, :, :oh - 1, :ow - 1] + + +def compute_locations(h, w, device, stride=1): + shifts_x = torch.arange( + 0, w * stride, step=stride, + dtype=torch.float32, device=device) + + shifts_y = torch.arange( + 0, h * stride, step=stride, + dtype=torch.float32, device=device) + + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2 + return locations + + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + +class FeatureResizer(nn.Module): + """ + This class takes as input a set of embeddings of dimension C1 and outputs a set of + embedding of dimension C2, after a linear transformation, dropout and normalization (LN). 
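+    In this model it projects the 768-d RoBERTa word and sentence features to the transformer hidden_dim.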
+ """ + + def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): + super().__init__() + self.do_ln = do_ln + # Object feature encoding + self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True) + self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) + self.dropout = nn.Dropout(dropout) + + def forward(self, encoder_features): + x = self.fc(encoder_features) + if self.do_ln: + x = self.layer_norm(x) + output = self.dropout(x) + return output + + +def build(args): + if args.binary: + num_classes = 1 + else: + if args.dataset_file == 'ytvos': + num_classes = 65 + elif args.dataset_file == 'davis': + num_classes = 78 + elif args.dataset_file == 'a2d' or args.dataset_file == 'jhmdb': + num_classes = 1 + else: + num_classes = 91 # for coco + device = torch.device(args.device) + + # backbone + if 'video_swin' in args.backbone: + from .video_swin_transformer import build_video_swin_backbone + backbone = build_video_swin_backbone(args) + elif 'swin' in args.backbone: + from .swin_transformer import build_swin_backbone + backbone = build_swin_backbone(args) + else: + backbone = build_backbone(args) + + transformer = build_deforamble_transformer(args) + + model = ReferFormer( + backbone, + transformer, + num_classes=num_classes, + num_queries=args.num_queries, + num_feature_levels=args.num_feature_levels, + num_frames=args.num_frames, + mask_dim=args.mask_dim, + dim_feedforward=args.dim_feedforward, + controller_layers=args.controller_layers, + dynamic_mask_channels=args.dynamic_mask_channels, + aux_loss=args.aux_loss, + with_box_refine=args.with_box_refine, + two_stage=args.two_stage, + freeze_text_encoder=args.freeze_text_encoder, + rel_coord=args.rel_coord + ) + matcher = build_matcher(args) + weight_dict = {} + weight_dict['loss_ce'] = args.cls_loss_coef + weight_dict['loss_bbox'] = args.bbox_loss_coef + weight_dict['loss_giou'] = args.giou_loss_coef + if args.masks: # always true + weight_dict['loss_mask'] = args.mask_loss_coef + weight_dict['loss_dice'] = args.dice_loss_coef + # TODO this is a hack + if args.aux_loss: + aux_weight_dict = {} + for i in range(args.dec_layers - 1): + aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + + losses = ['labels', 'boxes'] + if args.masks: + losses += ['masks'] + criterion = SetCriterion( + num_classes, + matcher=matcher, + weight_dict=weight_dict, + eos_coef=args.eos_coef, + losses=losses, + focal_alpha=args.focal_alpha) + criterion.to(device) + + # postprocessors, this is used for coco pretrain but not for rvos + postprocessors = build_postprocessors(args, args.dataset_file) + return model, criterion, postprocessors + + + diff --git a/my_datasets/__init__.py b/my_datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..724c8d0616a65de7b215a3c5c5517f9af9c8f84e --- /dev/null +++ b/my_datasets/__init__.py @@ -0,0 +1,40 @@ +import torch.utils.data +import torchvision + +from .ytvos import build as build_ytvos +from .ytvos_ref import build as build_ytvos_ref +from .davis import build as build_davis +from .a2d import build as build_a2d +from .jhmdb import build as build_jhmdb +from .refexp import build as build_refexp +from .concat_dataset import build as build_joint + + +def get_coco_api_from_dataset(dataset): + for _ in range(10): + # if isinstance(dataset, torchvision.datasets.CocoDetection): + # break + if isinstance(dataset, torch.utils.data.Subset): + dataset = dataset.dataset + if isinstance(dataset, 
torchvision.datasets.CocoDetection): + return dataset.coco + + +def build_dataset(dataset_file: str, image_set: str, args): + if dataset_file == 'ytvos': + return build_ytvos(image_set, args) + if dataset_file == 'ytvos_ref': + return build_ytvos_ref(image_set, args) + if dataset_file == 'davis': + return build_davis(image_set, args) + if dataset_file == 'a2d': + return build_a2d(image_set, args) + if dataset_file == 'jhmdb': + return build_jhmdb(image_set, args) + # for pretraining + if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog": + return build_refexp(dataset_file, image_set, args) + # for joint training of refcoco and ytvos + if dataset_file == 'joint': + return build_joint(image_set, args) + raise ValueError(f'dataset {dataset_file} not supported') diff --git a/my_datasets/__pycache__/__init__.cpython-310.pyc b/my_datasets/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb9a6d221b0027d7de99ee3a7b11c335322f51b3 Binary files /dev/null and b/my_datasets/__pycache__/__init__.cpython-310.pyc differ diff --git a/my_datasets/__pycache__/a2d.cpython-310.pyc b/my_datasets/__pycache__/a2d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f92920d90dd633ae4b9cc2493f6c584034aef25 Binary files /dev/null and b/my_datasets/__pycache__/a2d.cpython-310.pyc differ diff --git a/my_datasets/__pycache__/a2d.cpython-39.pyc b/my_datasets/__pycache__/a2d.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15866b20f69694461f5ff478dadcd2778984aaa7 Binary files /dev/null and b/my_datasets/__pycache__/a2d.cpython-39.pyc differ diff --git a/my_datasets/__pycache__/jhmdb.cpython-39.pyc b/my_datasets/__pycache__/jhmdb.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7599ad5e2c66e8fd1ed994dec584f1ba0e56e616 Binary files /dev/null and b/my_datasets/__pycache__/jhmdb.cpython-39.pyc differ diff --git a/my_datasets/__pycache__/refexp2seq.cpython-310.pyc b/my_datasets/__pycache__/refexp2seq.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b69155a3c51ee5da7fb774f13fb1836c6491ffb2 Binary files /dev/null and b/my_datasets/__pycache__/refexp2seq.cpython-310.pyc differ diff --git a/my_datasets/__pycache__/transforms_image.cpython-310.pyc b/my_datasets/__pycache__/transforms_image.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b15347ade71f0e6f7412500a3e9286646097d628 Binary files /dev/null and b/my_datasets/__pycache__/transforms_image.cpython-310.pyc differ diff --git a/my_datasets/__pycache__/ytvos.cpython-310.pyc b/my_datasets/__pycache__/ytvos.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0973ad22eb6e6b9d156c4cb8098b6aac68929815 Binary files /dev/null and b/my_datasets/__pycache__/ytvos.cpython-310.pyc differ diff --git a/my_datasets/a2d_eval.py b/my_datasets/a2d_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..fba7e86b9da2fbc2e2600b740d3a49f448b51ef2 --- /dev/null +++ b/my_datasets/a2d_eval.py @@ -0,0 +1,96 @@ +""" +This file contains implementations for the precision@k and IoU (mean, overall) evaluation metrics. 
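+Precision@K is the fraction of samples whose predicted IoU with the ground truth exceeds the
+threshold K (K in {0.5, 0.6, 0.7, 0.8, 0.9}); overall IoU divides the summed intersection by the
+summed union over all samples, and mean IoU averages the per-sample IoUs.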
+copy-paste from https://github.com/mttr2021/MTTR/blob/main/metrics.py +""" +import torch +from tqdm import tqdm +from pycocotools.coco import COCO +from pycocotools.mask import decode +import numpy as np + +from torchvision.ops.boxes import box_area + +def compute_bbox_iou(boxes1: torch.Tensor, boxes2: torch.Tensor): + # both boxes: xyxy + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + wh = (rb - lt).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = (inter+1e-6) / (union+1e-6) + return iou, inter, union + +def compute_mask_iou(outputs: torch.Tensor, labels: torch.Tensor, EPS=1e-6): + outputs = outputs.int() + intersection = (outputs & labels).float().sum((1, 2)) # Will be zero if Truth=0 or Prediction=0 + union = (outputs | labels).float().sum((1, 2)) # Will be zero if both are 0 + iou = (intersection + EPS) / (union + EPS) # EPS is used to avoid division by zero + return iou, intersection, union + +# mask +def calculate_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): + print('evaluating mask precision@k & iou metrics...') + counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} + total_intersection_area = 0 + total_union_area = 0 + ious_list = [] + for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance + gt_annot = coco_gt.imgToAnns[instance][0] + gt_mask = decode(gt_annot['segmentation']) + pred_annots = coco_pred.imgToAnns[instance] + pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score + pred_mask = decode(pred_annot['segmentation']) + iou, intersection, union = compute_mask_iou(torch.tensor(pred_mask).unsqueeze(0), + torch.tensor(gt_mask).unsqueeze(0)) + iou, intersection, union = iou.item(), intersection.item(), union.item() + for iou_threshold in counters_by_iou.keys(): + if iou > iou_threshold: + counters_by_iou[iou_threshold] += 1 + total_intersection_area += intersection + total_union_area += union + ious_list.append(iou) + num_samples = len(ious_list) + precision_at_k = np.array(list(counters_by_iou.values())) / num_samples + overall_iou = total_intersection_area / total_union_area + mean_iou = np.mean(ious_list) + return precision_at_k, overall_iou, mean_iou + +# bbox +def calculate_bbox_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): + print('evaluating bbox precision@k & iou metrics...') + counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} + total_intersection_area = 0 + total_union_area = 0 + ious_list = [] + for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance + gt_annot = coco_gt.imgToAnns[instance][0] + gt_bbox = gt_annot['bbox'] # xywh + gt_bbox = [ + gt_bbox[0], + gt_bbox[1], + gt_bbox[2] + gt_bbox[0], + gt_bbox[3] + gt_bbox[1], + ] + pred_annots = coco_pred.imgToAnns[instance] + pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score + pred_bbox = pred_annot['bbox'] # xyxy + iou, intersection, union = compute_bbox_iou(torch.tensor(pred_bbox).unsqueeze(0), + torch.tensor(gt_bbox).unsqueeze(0)) + iou, intersection, union = iou.item(), intersection.item(), union.item() + for iou_threshold in counters_by_iou.keys(): + if iou > iou_threshold: + counters_by_iou[iou_threshold] += 1 + total_intersection_area += intersection + total_union_area += union + 
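+        # also keep the per-sample IoU so that mean IoU can be reported alongside overall IoU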
ious_list.append(iou) + num_samples = len(ious_list) + precision_at_k = np.array(list(counters_by_iou.values())) / num_samples + overall_iou = total_intersection_area / total_union_area + mean_iou = np.mean(ious_list) + return precision_at_k, overall_iou, mean_iou diff --git a/my_datasets/categories.py b/my_datasets/categories.py new file mode 100644 index 0000000000000000000000000000000000000000..f2cf7030bd86c40f3c7807f5712689acbfb7ded0 --- /dev/null +++ b/my_datasets/categories.py @@ -0,0 +1,54 @@ +# ------------------------------------------------------------------------------------------------------------------- +# 1. Ref-Youtube-VOS +ytvos_category_dict = { + 'airplane': 0, 'ape': 1, 'bear': 2, 'bike': 3, 'bird': 4, 'boat': 5, 'bucket': 6, 'bus': 7, 'camel': 8, 'cat': 9, + 'cow': 10, 'crocodile': 11, 'deer': 12, 'dog': 13, 'dolphin': 14, 'duck': 15, 'eagle': 16, 'earless_seal': 17, + 'elephant': 18, 'fish': 19, 'fox': 20, 'frisbee': 21, 'frog': 22, 'giant_panda': 23, 'giraffe': 24, 'hand': 25, + 'hat': 26, 'hedgehog': 27, 'horse': 28, 'knife': 29, 'leopard': 30, 'lion': 31, 'lizard': 32, 'monkey': 33, + 'motorbike': 34, 'mouse': 35, 'others': 36, 'owl': 37, 'paddle': 38, 'parachute': 39, 'parrot': 40, 'penguin': 41, + 'person': 42, 'plant': 43, 'rabbit': 44, 'raccoon': 45, 'sedan': 46, 'shark': 47, 'sheep': 48, 'sign': 49, + 'skateboard': 50, 'snail': 51, 'snake': 52, 'snowboard': 53, 'squirrel': 54, 'surfboard': 55, 'tennis_racket': 56, + 'tiger': 57, 'toilet': 58, 'train': 59, 'truck': 60, 'turtle': 61, 'umbrella': 62, 'whale': 63, 'zebra': 64 +} + +ytvos_category_list = [ + 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bucket', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frisbee', 'frog', + 'giant_panda', 'giraffe', 'hand', 'hat', 'hedgehog', 'horse', 'knife', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'others', 'owl', 'paddle', 'parachute', 'parrot', 'penguin', 'person', + 'plant', 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'sign', 'skateboard', 'snail', 'snake', 'snowboard', + 'squirrel', 'surfboard', 'tennis_racket', 'tiger', 'toilet', 'train', 'truck', 'turtle', 'umbrella', 'whale', 'zebra' +] + +ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' +] + +# ------------------------------------------------------------------------------------------------------------------- +# 2. 
Ref-DAVIS17 +davis_category_dict = { + 'airplane': 0, 'backpack': 1, 'ball': 2, 'bear': 3, 'bicycle': 4, 'bird': 5, 'boat': 6, 'bottle': 7, 'box': 8, 'bus': 9, + 'camel': 10, 'car': 11, 'carriage': 12, 'cat': 13, 'cellphone': 14, 'chamaleon': 15, 'cow': 16, 'deer': 17, 'dog': 18, + 'dolphin': 19, 'drone': 20, 'elephant': 21, 'excavator': 22, 'fish': 23, 'goat': 24, 'golf cart': 25, 'golf club': 26, + 'grass': 27, 'guitar': 28, 'gun': 29, 'helicopter': 30, 'horse': 31, 'hoverboard': 32, 'kart': 33, 'key': 34, 'kite': 35, + 'koala': 36, 'leash': 37, 'lion': 38, 'lock': 39, 'mask': 40, 'microphone': 41, 'monkey': 42, 'motorcycle': 43, 'oar': 44, + 'paper': 45, 'paraglide': 46, 'person': 47, 'pig': 48, 'pole': 49, 'potted plant': 50, 'puck': 51, 'rack': 52, 'rhino': 53, + 'rope': 54, 'sail': 55, 'scale': 56, 'scooter': 57, 'selfie stick': 58, 'sheep': 59, 'skateboard': 60, 'ski': 61, 'ski poles': 62, + 'snake': 63, 'snowboard': 64, 'stick': 65, 'stroller': 66, 'surfboard': 67, 'swing': 68, 'tennis racket': 69, 'tractor': 70, + 'trailer': 71, 'train': 72, 'truck': 73, 'turtle': 74, 'varanus': 75, 'violin': 76, 'wheelchair': 77 +} + +davis_category_list = [ + 'airplane', 'backpack', 'ball', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'box', 'bus', 'camel', 'car', 'carriage', + 'cat', 'cellphone', 'chamaleon', 'cow', 'deer', 'dog', 'dolphin', 'drone', 'elephant', 'excavator', 'fish', 'goat', + 'golf cart', 'golf club', 'grass', 'guitar', 'gun', 'helicopter', 'horse', 'hoverboard', 'kart', 'key', 'kite', 'koala', + 'leash', 'lion', 'lock', 'mask', 'microphone', 'monkey', 'motorcycle', 'oar', 'paper', 'paraglide', 'person', 'pig', + 'pole', 'potted plant', 'puck', 'rack', 'rhino', 'rope', 'sail', 'scale', 'scooter', 'selfie stick', 'sheep', 'skateboard', + 'ski', 'ski poles', 'snake', 'snowboard', 'stick', 'stroller', 'surfboard', 'swing', 'tennis racket', 'tractor', 'trailer', + 'train', 'truck', 'turtle', 'varanus', 'violin', 'wheelchair' +] \ No newline at end of file diff --git a/my_datasets/coco.py b/my_datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d05d260c71ec942b5d3053a47823ed9fe03e3314 --- /dev/null +++ b/my_datasets/coco.py @@ -0,0 +1,157 @@ +""" +COCO dataset which returns image_id for evaluation. 
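+It wraps torchvision's CocoDetection and converts polygon annotations into boxes, labels and
+(optionally) binary instance masks.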
+ +Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py +""" +from pathlib import Path + +import torch +import torch.utils.data +import torchvision +from pycocotools import mask as coco_mask + +import datasets.transforms as T + + +class CocoDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, transforms, return_masks): + super(CocoDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + self.prepare = ConvertCocoPolysToMask(return_masks) + + def __getitem__(self, idx): + img, target = super(CocoDetection, self).__getitem__(idx) + image_id = self.ids[idx] + target = {'image_id': image_id, 'annotations': target} + + img, target = self.prepare(img, target) + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __init__(self, return_masks=False): + self.return_masks = return_masks + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + + anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + if self.return_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if self.return_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + if self.return_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = torch.as_tensor([int(h), int(w)]) + target["size"] = torch.as_tensor([int(h), int(w)]) + + return image, target + + +def make_coco_transforms(image_set): + + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] + + if image_set == 
'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.RandomSelect( + T.RandomResize(scales, max_size=1333), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=1333), + ]) + ), + normalize, + ]) + + if image_set == 'val': + return T.Compose([ + T.RandomResize([800], max_size=1333), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.coco_path) + assert root.exists(), f'provided COCO path {root} does not exist' + mode = 'instances' + PATHS = { + "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), + "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), + } + img_folder, ann_file = PATHS[image_set] + dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks) + return dataset diff --git a/my_datasets/refexp.py b/my_datasets/refexp.py new file mode 100644 index 0000000000000000000000000000000000000000..ee4cac86e0bedb8d1ef3e2e1aea3239715856b6d --- /dev/null +++ b/my_datasets/refexp.py @@ -0,0 +1,179 @@ +# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +COCO dataset which returns image_id for evaluation. +Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py +""" +from pathlib import Path + +import torch +import torch.utils.data +import torchvision +from pycocotools import mask as coco_mask + +import datasets.transforms_image as T + + +class ModulatedDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, transforms, return_masks): + super(ModulatedDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + self.prepare = ConvertCocoPolysToMask(return_masks) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + img, target = super(ModulatedDetection, self).__getitem__(idx) + image_id = self.ids[idx] + coco_img = self.coco.loadImgs(image_id)[0] + caption = coco_img["caption"] + dataset_name = coco_img["dataset_name"] if "dataset_name" in coco_img else None + target = {"image_id": image_id, "annotations": target, "caption": caption} + img, target = self.prepare(img, target) + if self._transforms is not None: + img, target = self._transforms(img, target) + target["dataset_name"] = dataset_name + for extra_key in ["sentence_id", "original_img_id", "original_id", "task_id"]: + if extra_key in coco_img: + target[extra_key] = coco_img[extra_key] # box xyxy -> cxcywh + # FIXME: handle "valid", since some box may be removed due to random crop + target["valid"] = torch.tensor([1]) if len(target["area"]) != 0 else torch.tensor([0]) + + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + import random + idx = random.randint(0, self.__len__() - 1) + return img.unsqueeze(0), target + # return img: [1, 3, H, W], the first dimension means T = 1. 
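+        # Minimal usage sketch (hypothetical `args` with `coco_path` and `masks` set, as read by build() below):
+        #   dataset = build('refcoco', 'train', args)
+        #   img, target = dataset[0]  # img: [1, 3, H, W]; target includes 'caption', 'boxes', 'labels', 'masks', 'valid'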
+ + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __init__(self, return_masks=False): + self.return_masks = return_masks + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + caption = target["caption"] if "caption" in target else None + + anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] # xminyminwh -> xyxy + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + if self.return_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + # keep the valid boxes + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if self.return_masks: + masks = masks[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + if caption is not None: + target["caption"] = caption + if self.return_masks: + target["masks"] = masks + target["image_id"] = image_id + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + target["valid"] = torch.tensor([1]) + target["orig_size"] = torch.as_tensor([int(h), int(w)]) + target["size"] = torch.as_tensor([int(h), int(w)]) + return image, target + + +def make_coco_transforms(image_set, cautious): + + normalize = T.Compose([T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]) + + scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + final_scales = [296, 328, 360, 392, 416, 448, 480, 512] + + max_size = 800 + if image_set == "train": + horizontal = [] if cautious else [T.RandomHorizontalFlip()] + return T.Compose( + horizontal + + [ + T.RandomSelect( + T.RandomResize(scales, max_size=max_size), + T.Compose( + [ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600, respect_boxes=cautious), + T.RandomResize(final_scales, max_size=640), + ] + ), + ), + normalize, + ] + ) + + if image_set == "val": + return T.Compose( + [ + T.RandomResize([360], max_size=640), + normalize, + ] + ) + + raise ValueError(f"unknown {image_set}") + + +def build(dataset_file, image_set, args): + root = Path(args.coco_path) + assert root.exists(), f"provided COCO path {root} does not exist" + mode = "instances" + dataset = dataset_file + PATHS = { + "train": (root / "train2014", root / dataset / f"{mode}_{dataset}_train.json"), + "val": (root / "train2014", root / dataset / f"{mode}_{dataset}_val.json"), + } + + img_folder, ann_file = PATHS[image_set] + dataset = ModulatedDetection( + img_folder, + ann_file, + 
transforms=make_coco_transforms(image_set, False), + return_masks=args.masks, + ) + return dataset \ No newline at end of file diff --git a/my_datasets/refexp2seq.py b/my_datasets/refexp2seq.py new file mode 100644 index 0000000000000000000000000000000000000000..3bc50f1850f2992b8749f41b3c7b9de93250371d --- /dev/null +++ b/my_datasets/refexp2seq.py @@ -0,0 +1,229 @@ +# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# For building refcoco, refcoco+, refcocog datasets +""" +COCO dataset which returns image_id for evaluation. +Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py +""" +from pathlib import Path + +import torch +import torch.utils.data +import torchvision +from pycocotools import mask as coco_mask + +import random +import numpy as np +from PIL import Image + +import datasets.transforms_video as T +from datasets.image_to_seq_augmenter import ImageToSeqAugmenter + +from util.box_ops import masks_to_boxes + + +class ModulatedDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, num_frames, transforms, return_masks): + super(ModulatedDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + self.prepare = ConvertCocoPolysToMask(return_masks) + self.num_frames = num_frames + self.augmenter = ImageToSeqAugmenter(perspective=True, affine=True, motion_blur=True, + rotation_range=(-20, 20), perspective_magnitude=0.08, + hue_saturation_range=(-5, 5), brightness_range=(-40, 40), + motion_blur_prob=0.25, motion_blur_kernel_sizes=(9, 11), + translate_range=(-0.1, 0.1)) + + def apply_random_sequence_shuffle(self, images, instance_masks): + perm = list(range(self.num_frames)) + random.shuffle(perm) + images = [images[i] for i in perm] + instance_masks = [instance_masks[i] for i in perm] + return images, instance_masks + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + img, target = super(ModulatedDetection, self).__getitem__(idx) + image_id = self.ids[idx] + coco_img = self.coco.loadImgs(image_id)[0] + caption = coco_img["caption"] + dataset_name = coco_img["dataset_name"] if "dataset_name" in coco_img else None + target = {"image_id": image_id, "annotations": target, "caption": caption} + img, target = self.prepare(img, target) + + # for a image, we rotate it to form a clip + seq_images, seq_instance_masks = [img], [target['masks'].numpy()] + numpy_masks = target['masks'].numpy() # [1, H, W] + + numinst = len(numpy_masks) + assert numinst == 1 + for t in range(self.num_frames - 1): + im_trafo, instance_masks_trafo = self.augmenter(np.asarray(img), numpy_masks) + im_trafo = Image.fromarray(np.uint8(im_trafo)) + seq_images.append(im_trafo) + seq_instance_masks.append(np.stack(instance_masks_trafo, axis=0)) + seq_images, seq_instance_masks = self.apply_random_sequence_shuffle(seq_images, seq_instance_masks) + output_inst_masks = [] + for inst_i in range(numinst): + inst_i_mask = [] + for f_i in range(self.num_frames): + inst_i_mask.append(seq_instance_masks[f_i][inst_i]) + output_inst_masks.append( np.stack(inst_i_mask, axis=0) ) + + output_inst_masks = torch.from_numpy( np.stack(output_inst_masks, axis=0) ) + target['masks'] = output_inst_masks.flatten(0,1) # [t, h, w] + target['boxes'] = masks_to_boxes(target['masks']) # [t, 4] + target['labels'] = target['labels'].repeat(self.num_frames) # [t,] + + if 
self._transforms is not None: + img, target = self._transforms(seq_images, target) + target["dataset_name"] = dataset_name + for extra_key in ["sentence_id", "original_img_id", "original_id", "task_id"]: + if extra_key in coco_img: + target[extra_key] = coco_img[extra_key] # box xyxy -> cxcywh + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + # set the gt box of empty mask to [0, 0, 0, 0] + for inst_id in range(len(target['boxes'])): + if target['masks'][inst_id].max()<1: + target['boxes'][inst_id] = torch.zeros(4).to(target['boxes'][inst_id]) + + target['boxes']=target['boxes'].clamp(1e-6) + return torch.stack(img,dim=0), target + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __init__(self, return_masks=False): + self.return_masks = return_masks + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + caption = target["caption"] if "caption" in target else None + + anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] # xminyminwh -> xyxy + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + if self.return_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + # keep the valid boxes + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if self.return_masks: + masks = masks[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + if caption is not None: + target["caption"] = caption + if self.return_masks: + target["masks"] = masks + target["image_id"] = image_id + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + target["valid"] = torch.tensor([1]) + target["orig_size"] = torch.as_tensor([int(h), int(w)]) + target["size"] = torch.as_tensor([int(h), int(w)]) + return image, target + + +def make_coco_transforms(image_set, max_size): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + 
T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + if image_set == "val": + return T.Compose( + [ + T.RandomResize([360], max_size=640), + normalize, + ] + ) + + raise ValueError(f"unknown {image_set}") + + +def build(dataset_file, image_set, args): + root = Path(args.coco_path) + assert root.exists(), f"provided COCO path {root} does not exist" + mode = "instances" + dataset = dataset_file + PATHS = { + "train": (root / "train2014", root / dataset / f"{mode}_{dataset}_train.json"), + "val": (root / "train2014", root / dataset / f"{mode}_{dataset}_val.json"), + } + + img_folder, ann_file = PATHS[image_set] + dataset = ModulatedDetection( + img_folder, + ann_file, + num_frames=args.num_frames, + transforms=make_coco_transforms(image_set, args.max_size), + return_masks=args.masks, + ) + return dataset diff --git a/my_datasets/transforms_image.py b/my_datasets/transforms_image.py new file mode 100644 index 0000000000000000000000000000000000000000..eff840cdd0f3dc43b4679e654c42d16090cffb30 --- /dev/null +++ b/my_datasets/transforms_image.py @@ -0,0 +1,304 @@ +# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Transforms and data augmentation for both image + bbox. +""" +import random + +import PIL +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F + +from util.box_ops import box_xyxy_to_cxcywh +from util.misc import interpolate + + +def crop(image, target, region): + cropped_image = F.crop(image, *region) + + target = target.copy() + i, j, h, w = region + + # should we do something wrt the original size? + target["size"] = torch.tensor([h, w]) + + fields = ["labels", "area", "iscrowd", "positive_map", "isfinal"] + + if "boxes" in target: + boxes = target["boxes"] + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) + target["boxes"] = cropped_boxes.reshape(-1, 4) + target["area"] = area + fields.append("boxes") + + if "masks" in target: + # FIXME should we update the area here if there are no boxes? 
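+        # region is (top, left, height, width) = (i, j, h, w), so the mask crop mirrors F.crop above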
+ target["masks"] = target["masks"][:, i : i + h, j : j + w] + fields.append("masks") + + # remove elements for which the boxes or masks that have zero area + if "boxes" in target or "masks" in target: + # favor boxes selection when defining which elements to keep + # this is compatible with previous implementation + if "boxes" in target: + cropped_boxes = target["boxes"].reshape(-1, 2, 2) + keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) + else: + keep = target["masks"].flatten(1).any(1) + + for field in fields: + if field in target: + target[field] = target[field][keep] + + return cropped_image, target + + +def hflip(image, target): + flipped_image = F.hflip(image) + + w, h = image.size + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) + target["boxes"] = boxes + + if "masks" in target: + target["masks"] = target["masks"].flip(-1) + + if "caption" in target: + caption = target["caption"].replace("left", "[TMP]").replace("right", "left").replace("[TMP]", "right") + target["caption"] = caption + + return flipped_image, target + + +def resize(image, target, size, max_size=None): + # size can be min_size (scalar) or (w, h) tuple + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + size = get_size(image.size, size, max_size) + rescaled_image = F.resize(image, size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = torch.tensor([h, w]) + + if "masks" in target: + target["masks"] = interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + + return rescaled_image, target + + +def pad(image, target, padding): + # assumes that we only pad on the bottom right corners + padded_image = F.pad(image, (0, 0, padding[0], padding[1])) + if target is None: + return padded_image, None + target = target.copy() + # should we do something wrt the original size? 
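+    # FIXME: padded_image is a PIL Image, so the padded size is padded_image.size[::-1] (h, w)
+    # as in transforms_video.pad; indexing the image directly here looks like a bug.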
+ target["size"] = torch.tensor(padded_image[::-1]) + if "masks" in target: + target["masks"] = torch.nn.functional.pad(target["masks"], (0, padding[0], 0, padding[1])) + return padded_image, target + + +class RandomCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + region = T.RandomCrop.get_params(img, self.size) + return crop(img, target, region) + + +class RandomSizeCrop(object): + def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False): + self.min_size = min_size + self.max_size = max_size + self.respect_boxes = respect_boxes # if True we can't crop a box out + + def __call__(self, img: PIL.Image.Image, target: dict): + init_boxes = len(target["boxes"]) + max_patience = 100 + for i in range(max_patience): + w = random.randint(self.min_size, min(img.width, self.max_size)) + h = random.randint(self.min_size, min(img.height, self.max_size)) + region = T.RandomCrop.get_params(img, [h, w]) + result_img, result_target = crop(img, target, region) + if not self.respect_boxes or len(result_target["boxes"]) == init_boxes or i == max_patience - 1: + return result_img, result_target + return result_img, result_target + + +class CenterCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + image_width, image_height = img.size + crop_height, crop_width = self.size + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) + return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) + + +class RandomHorizontalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return hflip(img, target) + return img, target + + +class RandomResize(object): + def __init__(self, sizes, max_size=None): + assert isinstance(sizes, (list, tuple)) + self.sizes = sizes + self.max_size = max_size + + def __call__(self, img, target=None): + size = random.choice(self.sizes) + return resize(img, target, size, self.max_size) + + +class RandomPad(object): + def __init__(self, max_pad): + self.max_pad = max_pad + + def __call__(self, img, target): + pad_x = random.randint(0, self.max_pad) + pad_y = random.randint(0, self.max_pad) + return pad(img, target, (pad_x, pad_y)) + + +class RandomSelect(object): + """ + Randomly selects between transforms1 and transforms2, + with probability p for transforms1 and (1 - p) for transforms2 + """ + + def __init__(self, transforms1, transforms2, p=0.5): + self.transforms1 = transforms1 + self.transforms2 = transforms2 + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return self.transforms1(img, target) + return self.transforms2(img, target) + + +class ToTensor(object): + def __call__(self, img, target): + return F.to_tensor(img), target + + +class RandomErasing(object): + def __init__(self, *args, **kwargs): + self.eraser = T.RandomErasing(*args, **kwargs) + + def __call__(self, img, target): + return self.eraser(img), target + + +class Normalize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, image, target=None): + image = F.normalize(image, mean=self.mean, std=self.std) + if target is None: + return image, None + target = target.copy() + h, w = image.shape[-2:] + if "boxes" in target: + boxes = target["boxes"] + boxes = box_xyxy_to_cxcywh(boxes) + boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) + target["boxes"] = boxes + return image, 
target + + +class RemoveDifficult(object): + def __init__(self, enabled=False): + self.remove_difficult = enabled + + def __call__(self, image, target=None): + if target is None: + return image, None + target = target.copy() + keep = ~target["iscrowd"].to(torch.bool) | (not self.remove_difficult) + if "boxes" in target: + target["boxes"] = target["boxes"][keep] + target["labels"] = target["labels"][keep] + target["iscrowd"] = target["iscrowd"][keep] + return image, target + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string diff --git a/my_datasets/transforms_video.py b/my_datasets/transforms_video.py new file mode 100644 index 0000000000000000000000000000000000000000..f2145e9089185af92479328b878f158292d38d02 --- /dev/null +++ b/my_datasets/transforms_video.py @@ -0,0 +1,565 @@ +""" +Transforms and data augmentation for sequence level images, bboxes and masks. +""" +import random + +import PIL +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F + +from util.box_ops import box_xyxy_to_cxcywh, box_iou +from util.misc import interpolate +import numpy as np +from numpy import random as rand +from PIL import Image +import cv2 + + + +class Check(object): + def __init__(self,): + pass + def __call__(self, img, target): + fields = ["labels"] + if "boxes" in target: + fields.append("boxes") + if "masks" in target: + fields.append("masks") + + ### check if box or mask still exist after transforms + if "boxes" in target or "masks" in target: + if "boxes" in target: + cropped_boxes = target['boxes'].reshape(-1, 2, 2) + keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) + else: + keep = target['masks'].flatten(1).any(1) + + if False in keep: + for k in range(len(keep)): + if not keep[k] and "boxes" in target: + target['boxes'][k] = target['boxes'][k]//1000.0 # [0, 0, 0, 0] + + target['valid'] = keep.to(torch.int32) + + return img, target + + + +def bbox_overlaps(bboxes1, bboxes2, mode='iou', eps=1e-6): + assert mode in ['iou', 'iof'] + bboxes1 = bboxes1.astype(np.float32) + bboxes2 = bboxes2.astype(np.float32) + rows = bboxes1.shape[0] + cols = bboxes2.shape[0] + ious = np.zeros((rows, cols), dtype=np.float32) + if rows * cols == 0: + return ious + exchange = False + if bboxes1.shape[0] > bboxes2.shape[0]: + bboxes1, bboxes2 = bboxes2, bboxes1 + ious = np.zeros((cols, rows), dtype=np.float32) + exchange = True + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1]) + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1]) + for i in range(bboxes1.shape[0]): + x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) + y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) + x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) + y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) + overlap = np.maximum(x_end - x_start, 0) * np.maximum(y_end - y_start, 0) + if mode == 'iou': + union = area1[i] + area2 - overlap + else: + union = area1[i] if not exchange else area2 + union = np.maximum(union, eps) + ious[i, :] = overlap / union + if exchange: + ious = ious.T + return ious + + +def crop(clip, target, region): + cropped_image = [] + for image in clip: + 
cropped_image.append(F.crop(image, *region)) + + target = target.copy() + i, j, h, w = region + + # should we do something wrt the original size? + target["size"] = torch.tensor([h, w]) + + fields = ["labels", "area", "iscrowd"] + + if "boxes" in target: + boxes = target["boxes"] + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) + target["boxes"] = cropped_boxes.reshape(-1, 4) + target["area"] = area + fields.append("boxes") + + if "masks" in target: + # FIXME should we update the area here if there are no boxes? + target['masks'] = target['masks'][:, i:i + h, j:j + w] + fields.append("masks") + + return cropped_image, target + + +def hflip(clip, target): + flipped_image = [] + for image in clip: + flipped_image.append(F.hflip(image)) + + w, h = clip[0].size + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) + target["boxes"] = boxes + + if "masks" in target: + target['masks'] = target['masks'].flip(-1) + + return flipped_image, target + +def vflip(image,target): + flipped_image = [] + for image in clip: + flipped_image.append(F.vflip(image)) + w, h = clip[0].size + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [0, 3, 2, 1]] * torch.as_tensor([1, -1, 1, -1]) + torch.as_tensor([0, h, 0, h]) + target["boxes"] = boxes + + if "masks" in target: + target['masks'] = target['masks'].flip(1) + + return flipped_image, target + +def resize(clip, target, size, max_size=None): + # size can be min_size (scalar) or (w, h) tuple + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + size = get_size(clip[0].size, size, max_size) + rescaled_image = [] + for image in clip: + rescaled_image.append(F.resize(image, size)) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image[0].size, clip[0].size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = torch.tensor([h, w]) + + if "masks" in target: + if target['masks'].shape[0]>0: + target['masks'] = interpolate( + target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + else: + target['masks'] = torch.zeros((target['masks'].shape[0],h,w)) + return rescaled_image, target 
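+
+# pad and the crop/flip-style transforms below follow the same clip-level convention as
+# crop/hflip/resize above: a list of PIL frames plus one shared target dict, whose
+# 'boxes'/'masks'/'size' entries are updated to stay consistent with the transformed frames.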
+ + +def pad(clip, target, padding): + # assumes that we only pad on the bottom right corners + padded_image = [] + for image in clip: + padded_image.append(F.pad(image, (0, 0, padding[0], padding[1]))) + if target is None: + return padded_image, None + target = target.copy() + # should we do something wrt the original size? + target["size"] = torch.tensor(padded_image[0].size[::-1]) + if "masks" in target: + target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1])) + return padded_image, target + + +class RandomCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + region = T.RandomCrop.get_params(img, self.size) + return crop(img, target, region) + + +class RandomSizeCrop(object): + def __init__(self, min_size: int, max_size: int): + self.min_size = min_size + self.max_size = max_size + + def __call__(self, img: PIL.Image.Image, target: dict): + w = random.randint(self.min_size, min(img[0].width, self.max_size)) + h = random.randint(self.min_size, min(img[0].height, self.max_size)) + region = T.RandomCrop.get_params(img[0], [h, w]) + return crop(img, target, region) + + +class CenterCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + image_width, image_height = img.size + crop_height, crop_width = self.size + crop_top = int(round((image_height - crop_height) / 2.)) + crop_left = int(round((image_width - crop_width) / 2.)) + return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) + + +class MinIoURandomCrop(object): + def __init__(self, min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3): + self.min_ious = min_ious + self.sample_mode = (1, *min_ious, 0) + self.min_crop_size = min_crop_size + + def __call__(self, img, target): + w,h = img.size + while True: + mode = random.choice(self.sample_mode) + self.mode = mode + if mode == 1: + return img,target + min_iou = mode + boxes = target['boxes'].numpy() + labels = target['labels'] + + for i in range(50): + new_w = rand.uniform(self.min_crop_size * w, w) + new_h = rand.uniform(self.min_crop_size * h, h) + if new_h / new_w < 0.5 or new_h / new_w > 2: + continue + left = rand.uniform(w - new_w) + top = rand.uniform(h - new_h) + patch = np.array((int(left), int(top), int(left + new_w), int(top + new_h))) + if patch[2] == patch[0] or patch[3] == patch[1]: + continue + overlaps = bbox_overlaps(patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1) + if len(overlaps) > 0 and overlaps.min() < min_iou: + continue + + if len(overlaps) > 0: + def is_center_of_bboxes_in_patch(boxes, patch): + center = (boxes[:, :2] + boxes[:, 2:]) / 2 + mask = ((center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * (center[:, 0] < patch[2]) * (center[:, 1] < patch[3])) + return mask + mask = is_center_of_bboxes_in_patch(boxes, patch) + if False in mask: + continue + #TODO: use no center boxes + #if not mask.any(): + # continue + + boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) + boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) + boxes -= np.tile(patch[:2], 2) + target['boxes'] = torch.tensor(boxes) + + img = np.asarray(img)[patch[1]:patch[3], patch[0]:patch[2]] + img = Image.fromarray(img) + width, height = img.size + target['orig_size'] = torch.tensor([height,width]) + target['size'] = torch.tensor([height,width]) + return img,target + + +class RandomContrast(object): + def __init__(self, lower=0.5, upper=1.5): + self.lower = lower + self.upper = upper + assert self.upper >= self.lower, "contrast upper must be >= lower." 
+ assert self.lower >= 0, "contrast lower must be non-negative." + def __call__(self, image, target): + + if rand.randint(2): + alpha = rand.uniform(self.lower, self.upper) + image *= alpha + return image, target + +class RandomBrightness(object): + def __init__(self, delta=32): + assert delta >= 0.0 + assert delta <= 255.0 + self.delta = delta + def __call__(self, image, target): + if rand.randint(2): + delta = rand.uniform(-self.delta, self.delta) + image += delta + return image, target + +class RandomSaturation(object): + def __init__(self, lower=0.5, upper=1.5): + self.lower = lower + self.upper = upper + assert self.upper >= self.lower, "contrast upper must be >= lower." + assert self.lower >= 0, "contrast lower must be non-negative." + + def __call__(self, image, target): + if rand.randint(2): + image[:, :, 1] *= rand.uniform(self.lower, self.upper) + return image, target + +class RandomHue(object): # + def __init__(self, delta=18.0): + assert delta >= 0.0 and delta <= 360.0 + self.delta = delta + + def __call__(self, image, target): + if rand.randint(2): + image[:, :, 0] += rand.uniform(-self.delta, self.delta) + image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 + image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 + return image, target + +class RandomLightingNoise(object): + def __init__(self): + self.perms = ((0, 1, 2), (0, 2, 1), + (1, 0, 2), (1, 2, 0), + (2, 0, 1), (2, 1, 0)) + def __call__(self, image, target): + if rand.randint(2): + swap = self.perms[rand.randint(len(self.perms))] + shuffle = SwapChannels(swap) # shuffle channels + image = shuffle(image) + return image, target + +class ConvertColor(object): + def __init__(self, current='BGR', transform='HSV'): + self.transform = transform + self.current = current + + def __call__(self, image, target): + if self.current == 'BGR' and self.transform == 'HSV': + image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) + elif self.current == 'HSV' and self.transform == 'BGR': + image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) + else: + raise NotImplementedError + return image, target + +class SwapChannels(object): + def __init__(self, swaps): + self.swaps = swaps + def __call__(self, image): + image = image[:, :, self.swaps] + return image + +class PhotometricDistort(object): + def __init__(self): + self.pd = [ + RandomContrast(), + ConvertColor(transform='HSV'), + RandomSaturation(), + RandomHue(), + ConvertColor(current='HSV', transform='BGR'), + RandomContrast() + ] + self.rand_brightness = RandomBrightness() + self.rand_light_noise = RandomLightingNoise() + + def __call__(self,clip,target): + imgs = [] + for img in clip: + img = np.asarray(img).astype('float32') + img, target = self.rand_brightness(img, target) + if rand.randint(2): + distort = Compose(self.pd[:-1]) + else: + distort = Compose(self.pd[1:]) + img, target = distort(img, target) + img, target = self.rand_light_noise(img, target) + imgs.append(Image.fromarray(img.astype('uint8'))) + return imgs, target + +# NOTICE: if used for mask, need to change +class Expand(object): + def __init__(self, mean): + self.mean = mean + def __call__(self, clip, target): + if rand.randint(2): + return clip,target + imgs = [] + masks = [] + image = np.asarray(clip[0]).astype('float32') + height, width, depth = image.shape + ratio = rand.uniform(1, 4) + left = rand.uniform(0, width*ratio - width) + top = rand.uniform(0, height*ratio - height) + for i in range(len(clip)): + image = np.asarray(clip[i]).astype('float32') + expand_image = np.zeros((int(height*ratio), int(width*ratio), 
depth),dtype=image.dtype) + expand_image[:, :, :] = self.mean + expand_image[int(top):int(top + height),int(left):int(left + width)] = image + imgs.append(Image.fromarray(expand_image.astype('uint8'))) + expand_mask = torch.zeros((int(height*ratio), int(width*ratio)),dtype=torch.uint8) + expand_mask[int(top):int(top + height),int(left):int(left + width)] = target['masks'][i] + masks.append(expand_mask) + boxes = target['boxes'].numpy() + boxes[:, :2] += (int(left), int(top)) + boxes[:, 2:] += (int(left), int(top)) + target['boxes'] = torch.tensor(boxes) + target['masks']=torch.stack(masks) + return imgs, target + +class RandomHorizontalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + # NOTE: caption for 'left' and 'right' should also change + caption = target['caption'] + target['caption'] = caption.replace('left', '@').replace('right', 'left').replace('@', 'right') + return hflip(img, target) + return img, target + +class RandomVerticalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return vflip(img, target) + return img, target + + +class RandomResize(object): + def __init__(self, sizes, max_size=None): + assert isinstance(sizes, (list, tuple)) + self.sizes = sizes + self.max_size = max_size + + def __call__(self, img, target=None): + size = random.choice(self.sizes) + return resize(img, target, size, self.max_size) + + +class RandomPad(object): + def __init__(self, max_pad): + self.max_pad = max_pad + + def __call__(self, img, target): + pad_x = random.randint(0, self.max_pad) + pad_y = random.randint(0, self.max_pad) + return pad(img, target, (pad_x, pad_y)) + + +class RandomSelect(object): + """ + Randomly selects between transforms1 and transforms2, + with probability p for transforms1 and (1 - p) for transforms2 + """ + def __init__(self, transforms1, transforms2, p=0.5): + self.transforms1 = transforms1 + self.transforms2 = transforms2 + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return self.transforms1(img, target) + return self.transforms2(img, target) + + +class ToTensor(object): + def __call__(self, clip, target): + img = [] + for im in clip: + img.append(F.to_tensor(im)) + return img, target + + +class RandomErasing(object): + + def __init__(self, *args, **kwargs): + self.eraser = T.RandomErasing(*args, **kwargs) + + def __call__(self, img, target): + return self.eraser(img), target + + +class Normalize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, clip, target=None): + image = [] + for im in clip: + image.append(F.normalize(im, mean=self.mean, std=self.std)) + if target is None: + return image, None + target = target.copy() + h, w = image[0].shape[-2:] + if "boxes" in target: + boxes = target["boxes"] + boxes = box_xyxy_to_cxcywh(boxes) + boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) + target["boxes"] = boxes + return image, target + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string
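PhotometricDistort and the jitter classes defined earlier (RandomBrightness, RandomContrast, RandomSaturation, RandomHue) operate on frames as float32 NumPy arrays, flip a coin per operation with rand.randint(2), and only convert back to uint8 PIL images at the end. A small standalone sketch of that convention; the shift and scale ranges are illustrative assumptions, not values taken from this module:

import numpy as np
from numpy import random as rand

img = np.full((2, 2, 3), 128.0, dtype=np.float32)  # toy float32 image in [0, 255]

if rand.randint(2):                    # 50/50 coin flip, as in the classes above
    img += rand.uniform(-32.0, 32.0)   # additive brightness shift
if rand.randint(2):
    img *= rand.uniform(0.5, 1.5)      # multiplicative contrast scale

# Jittered values can leave [0, 255]; this sketch clips before casting back to uint8.
img = np.clip(img, 0, 255).astype(np.uint8)
print(img[0, 0])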
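Normalize above standardizes pixel statistics and also converts absolute xyxy boxes to normalized cxcywh through box_xyxy_to_cxcywh, which this file presumably imports from the repo's util.box_ops. A self-contained sketch of that box conversion (the function name here is illustrative):

import torch

def box_xyxy_to_cxcywh_norm(boxes: torch.Tensor, h: int, w: int) -> torch.Tensor:
    # Absolute xyxy -> center-x, center-y, width, height, divided by image size.
    x1, y1, x2, y2 = boxes.unbind(-1)
    out = torch.stack([(x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1], dim=-1)
    return out / torch.tensor([w, h, w, h], dtype=torch.float32)

print(box_xyxy_to_cxcywh_norm(torch.tensor([[10., 20., 50., 80.]]), h=100, w=200))
# tensor([[0.1500, 0.5000, 0.2000, 0.6000]])

For a 200x100 (w x h) frame, the box [10, 20, 50, 80] thus becomes [0.15, 0.50, 0.20, 0.60], which is the format the downstream box losses expect.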
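Putting the pieces together, a typical train-time chain over a clip (a list of PIL frames) and its target dict might look like the sketch below. The resize scales, normalization statistics, and the import path datasets.transforms_video are assumptions about how this module is meant to be wired up, not the repo's actual training configuration:

import datasets.transforms_video as T  # assumed import path for this file

train_transforms = T.Compose([
    T.RandomHorizontalFlip(0.5),          # also swaps 'left'/'right' in target['caption']
    T.PhotometricDistort(),
    T.RandomResize([288, 320, 352, 384], max_size=640),  # placeholder scales
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # ImageNet statistics
])

# clip: list of PIL.Image frames; target: dict with 'boxes', 'masks', 'caption', ...
clip, target = train_transforms(clip, target)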