diff --git a/.gitattributes b/.gitattributes
index 4a9e052dec3f6cf4bf547aafe35fc8a03b2dc4a1..6ac8e9c365bba2e3b98f815e18395577c9879559 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -46,3 +46,4 @@ LAVT-RIS/refer/data/refcocog/instances.json filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/data/refcocog/refs(google).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/data/refcocog/refs(umd).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/evaluation/tokenizer/stanford-corenlp-3.4.1.jar filter=lfs diff=lfs merge=lfs -text
+hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b filter=lfs diff=lfs merge=lfs -text
diff --git a/.history/datasets/a2d_20250203174308.py b/.history/datasets/a2d_20250203174308.py
new file mode 100644
index 0000000000000000000000000000000000000000..b826e1310fccfc636bc415d94edbdb384042212f
--- /dev/null
+++ b/.history/datasets/a2d_20250203174308.py
@@ -0,0 +1,247 @@
+"""
+A2D-Sentences data loader
+modified from https://github.com/mttr2021/MTTR/blob/main/datasets/a2d_sentences/a2d_sentences_dataset.py
+"""
+from pathlib import Path
+
+import torch
+from torchvision.io import read_video
+import torchvision.transforms.functional as F
+
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+
+import h5py
+from pycocotools.mask import encode, area
+
+
+def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
+    image_id = f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
+    return image_id
+
+class A2DSentencesDataset(Dataset):
+    """
+    A Torch dataset for A2D-Sentences.
+    For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at:
+    https://arxiv.org/abs/1803.07485
+    """
+    def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int, subset):
+        super(A2DSentencesDataset, self).__init__()
+        dataset_path = str(image_folder)
+        self.mask_annotations_dir = os.path.join(dataset_path, 'text_annotations/a2d_annotation_with_instances')
+        self.videos_dir = os.path.join(dataset_path, 'Release/clips320H')
+        self.ann_file = ann_file
+        self.text_annotations = self.get_text_annotations()
+
+        self._transforms = transforms
+        self.return_masks = return_masks  # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        self.subset = subset
+
+        print(f'\n {subset} sample num: ', len(self.text_annotations))
+        print('\n')
+
+    def get_text_annotations(self):
+        with open(str(self.ann_file), 'r') as f:
+            text_annotations_by_frame = [tuple(a) for a in json.load(f)]
+        return text_annotations_by_frame
+
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2
+
+    def __len__(self):
+        return len(self.text_annotations)
+
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            text_query, video_id, frame_idx, instance_id = self.text_annotations[idx]
+
+            text_query = " ".join(text_query.lower().split())  # clean up the text query
+
+            # read the source window frames:
+            video_frames, _, _ = read_video(os.path.join(self.videos_dir, f'{video_id}.mp4'), pts_unit='sec')  # (T, H, W, C)
+            vid_len = len(video_frames)
+            # note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
+            frame_id = frame_idx - 1
+
+            if self.subset == 'train':
+                # get a window of window_size frames with frame frame_id in the middle.
+                num_frames = self.num_frames
+                # random sparse sample
+                sample_indx = [frame_id]
+                # local sample
+                sample_id_before = random.randint(1, 3)
+                sample_id_after = random.randint(1, 3)
+                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
+                sample_indx.extend(local_indx)
+
+                # global sampling
+                if num_frames > 3:
+                    all_inds = list(range(vid_len))
+                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
+                    global_n = num_frames - len(sample_indx)
+                    if len(global_inds) > global_n:
+                        select_id = random.sample(range(len(global_inds)), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(global_inds[s_id])
+                    elif vid_len >= global_n:  # sample long range global frames
+                        select_id = random.sample(range(vid_len), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                    else:
+                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+            elif self.subset == 'val':
+                start_idx, end_idx = frame_id - self.num_frames // 2, frame_id + (self.num_frames + 1) // 2
+                sample_indx = []
+                for i in range(start_idx, end_idx):
+                    i = min(max(i, 0), len(video_frames) - 1)  # pad out of range indices with edge frames
+                    sample_indx.append(i)
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+
+            # read frames
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for j in range(self.num_frames):
+                frame_indx = sample_indx[j]
+                img = F.to_pil_image(video_frames[frame_indx].permute(2, 0, 1))
+                imgs.append(img)
+
+            # read the instance mask
+            frame_annot_path = os.path.join(self.mask_annotations_dir, video_id, f'{frame_idx:05d}.h5')
+            f = h5py.File(frame_annot_path)
+            instances = list(f['instance'])
+            instance_idx = instances.index(instance_id)  # existence was already validated during init
+
+            instance_masks = np.array(f['reMask'])
+            if len(instances) == 1:
+                instance_masks = instance_masks[np.newaxis, ...]
+            instance_masks = torch.tensor(instance_masks).transpose(1, 2)
+            mask_rles = [encode(mask) for mask in instance_masks.numpy()]
+            mask_areas = area(mask_rles).astype(float)
+            f.close()
+
+            # select the referred mask
+            label = torch.tensor(0, dtype=torch.long)
+            mask = instance_masks[instance_idx].numpy()
+            if (mask > 0).any():
+                y1, y2, x1, x2 = self.bounding_box(mask)
+                box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                valid.append(1)
+            else:  # some frame didn't contain the instance
+                box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                valid.append(0)
+            mask = torch.from_numpy(mask)
+            labels.append(label)
+            boxes.append(box)
+            masks.append(mask)
+
+            # transform
+            h, w = instance_masks.shape[-2:]
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            # there is only one valid frame
+            target = {
+                'frames_idx': torch.tensor(sample_indx),  # [T,]
+                'valid_indices': torch.tensor([valid_indices]),
+                'labels': labels,  # [1,]
+                'boxes': boxes,  # [1, 4], xyxy
+                'masks': masks,  # [1, H, W]
+                'valid': torch.tensor(valid),  # [1,]
+                'caption': text_query,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)]),
+                'image_id': get_image_id(video_id, frame_idx, instance_id)
+            }
+
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at least one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+
+        return imgs, target
+
+
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+
+def build(image_set, args):
+    root = Path(args.a2d_path)
+    assert root.exists(), f'provided A2D-Sentences path {root} does not exist'
+    PATHS = {
+        "train": (root, root / "a2d_sentences_single_frame_train_annotations.json"),
+        "val": (root, root / "a2d_sentences_single_frame_test_annotations.json"),
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = A2DSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size),
+    #                               return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
+    dataset = A2DSentencesDataset(img_folder, ann_file, transforms=None,
+                                  return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
+    return dataset
\ No newline at end of file
diff --git a/.history/datasets/ytvos_ref_20250113131134.py
b/.history/datasets/ytvos_ref_20250113131134.py new file mode 100644 index 0000000000000000000000000000000000000000..49b8fede62b2b4e0fdfbaf3aa9f3902002a82acc --- /dev/null +++ b/.history/datasets/ytvos_ref_20250113131134.py @@ -0,0 +1,241 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + print(vid_meta) + + for exp_id, exp_dict in vid_data['expressions'].items(): + print(exp_dict) + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'bins': bins, + 'category': vid_meta['objects'][int(exp_dict['obj_id'])]['category'] + } + self.metas.append(meta) + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, 
rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_indx: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_indx), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def 
build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250113131327.py b/.history/datasets/ytvos_ref_20250113131327.py new file mode 100644 index 0000000000000000000000000000000000000000..a8bce1e2bef3f75fe70214de4116d034bb4c681f --- /dev/null +++ b/.history/datasets/ytvos_ref_20250113131327.py @@ -0,0 +1,241 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. 
+ + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + print(vid_meta) + + for exp_id, exp_dict in vid_data['expressions'].items(): + print(exp_dict) + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_indx: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, 
h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_indx), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250113141118.py b/.history/datasets/ytvos_ref_20250113141118.py new file mode 100644 index 0000000000000000000000000000000000000000..2ac73aa31f23d2f6777ff1252b84ca7080f02b50 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250113141118.py @@ -0,0 +1,241 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see 
https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + print(vid_meta) + print(vid_data) + + + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in 
sample_indx: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_indx), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250113162417.py 
b/.history/datasets/ytvos_ref_20250113162417.py new file mode 100644 index 0000000000000000000000000000000000000000..fb5327725f8bc17793c8e0f94cbb35d7c8d6d9c8 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250113162417.py @@ -0,0 +1,241 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for frame_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'frame_id' : frame_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': 
vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if 
image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250113163313.py b/.history/datasets/ytvos_ref_20250113163313.py new file mode 100644 index 0000000000000000000000000000000000000000..6e1449d07223d2b2aee04d711fafc424284dfa1b --- /dev/null +++ b/.history/datasets/ytvos_ref_20250113163313.py @@ -0,0 +1,248 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. 
+ + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for frame_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'frame_id' : frame_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + 
valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250114201904.py b/.history/datasets/ytvos_ref_20250114201904.py new file mode 100644 index 0000000000000000000000000000000000000000..8fe76d554794a61fb11e7c5cdb4e1d68592e32e2 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250114201904.py @@ -0,0 +1,252 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class 
YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + print(f"vid_data: {vid_data}") + print(f"vid_meta: {vid_meta}") + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for sample_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'sample_id' : sample_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def 
__getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), 
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250114201908.py b/.history/datasets/ytvos_ref_20250114201908.py new file mode 100644 index 0000000000000000000000000000000000000000..f566a830e7ae4a35219b6e3034787a803bf83ea7 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250114201908.py @@ -0,0 +1,253 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. 
+ + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + print(f"vid_data: {vid_data}") + print(f"vid_meta: {vid_meta}") + + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for sample_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'sample_id' : sample_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if 
(mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250114202340.py b/.history/datasets/ytvos_ref_20250114202340.py new file mode 100644 index 0000000000000000000000000000000000000000..2df05592439e77dd69a7854ae746dda6cab910c7 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250114202340.py @@ -0,0 +1,251 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json 
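The frame-selection strategy these snapshots converge on is: drop the first two and last two frames, split the remaining index range into four equal bins, and draw one random frame index per bin (the `vid_len < 11` skip is strict enough to keep every bin non-empty). A standalone sketch of that logic follows; the function name and default parameters are illustrative, not part of the original code.

import random

def sample_frames_from_bins(vid_len, num_bins=4, margin=2):
    """Illustrative restatement of the bin-based sampling in prepare_metas().

    Drops `margin` frames at each end, splits the remaining index range into
    `num_bins` equal bins, and draws one random index per bin.
    """
    start_idx, end_idx = margin, vid_len - margin
    bin_size = (end_idx - start_idx) // num_bins

    bins = []
    for i in range(num_bins):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
        bins.append((bin_start, bin_end))

    sample_indx = sorted(random.randint(lo, hi - 1) for lo, hi in bins)
    return bins, sample_indx

# e.g. a 40-frame video yields bins (2, 11), (11, 20), (20, 29), (29, 38)
bins, sample_indx = sample_frames_from_bins(40)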
+import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.vid_data, self.vid_meta = self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + return vid_meta, vid_data + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for sample_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'sample_id' : sample_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, 
-1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not 
exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250114205314.py b/.history/datasets/ytvos_ref_20250114205314.py new file mode 100644 index 0000000000000000000000000000000000000000..9174bb9ce61fccc70ee309f6b440c0efecfe639c --- /dev/null +++ b/.history/datasets/ytvos_ref_20250114205314.py @@ -0,0 +1,250 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. 
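Downstream, these snapshots construct the dataset through the build() helper defined at the bottom of each file. A minimal usage sketch under stated assumptions: the argument values are placeholders, only the attributes build() actually reads are set, and (as in these snapshots) transforms is None, so the returned images are raw uint8 tensors.

from types import SimpleNamespace

# Placeholder values; build() only reads ytvos_path, masks, num_frames, max_skip.
args = SimpleNamespace(
    ytvos_path='data/ref-youtube-vos',  # assumed dataset root
    masks=True,
    num_frames=5,
    max_skip=3,
)
dataset = build('train', args)          # the build() defined at the end of this file
imgs, target = dataset[0]
# imgs: [T, 3, H, W] tensor; target: dict with 'boxes', 'masks', 'valid', ...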
+ + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + for exp_id, exp_dict in vid_data['expressions'].items(): + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + for sample_id in sample_indx: + meta = { + 'video': vid, + 'exp': exp_dict['exp'], + 'obj_id': int(exp_dict['obj_id']), + 'frames': vid_frames, + 'sample_id' : sample_id, + 'sample_frames_id' : sample_indx, + 'bins': bins, + 'category': vid_meta['objects'][exp_dict['obj_id']]['category'] + } + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = 
torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250114211305.py b/.history/datasets/ytvos_ref_20250114211305.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f7cf0846cb2b617527d149fc2ce9a9c99a9430 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250114211305.py @@ -0,0 +1,252 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import 
ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + meta = { + 'video':vid, + 'sample_indx':sample_indx, + 'bins':bins + } + obj_id_cat = {} + for exp_id, exp_dict in vid_data['expressions'].items(): + obj_id = exp_dict['obj_id'] + print(obj_id, type(obj_id)) + print(vid_meta['objects'].keys()) + if obj_id not in obj_id_cat: + obj_id_cat[obj_id] = vid_meta[obj_id]['category'] + meta['obj_id_cat'] = obj_id_cat + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def 
__getitem__(self, idx): + instance_check = False + while not instance_check: + meta = self.metas[idx] # dict + + + video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \ + meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins'] + + + # clean up the caption + exp = " ".join(exp.lower().split()) + category_id = category_dict[category] + vid_len = len(frames) + + # num_frames = self.num_frames + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_frames_id: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + mask = Image.open(mask_path).convert('P') + + # create the target + label = torch.tensor(category_id) + mask = np.array(mask) + mask = (mask==obj_id).astype(np.float32) # 0,1 binary + if (mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + mask = torch.from_numpy(mask) + + # append + imgs.append(img) + labels.append(label) + masks.append(mask) + boxes.append(box) + + # transform + w, h = img.size + labels = torch.stack(labels, dim=0) + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': torch.tensor(sample_frames_id), # [T,] + 'labels': labels, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'caption': exp, + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), 
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/datasets/ytvos_ref_20250116074326.py b/.history/datasets/ytvos_ref_20250116074326.py new file mode 100644 index 0000000000000000000000000000000000000000..a0df51a8617ac182d3b5b72628751c3858b1d463 --- /dev/null +++ b/.history/datasets/ytvos_ref_20250116074326.py @@ -0,0 +1,239 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.autograd.grad_mode import F +from torch.utils.data import Dataset +import datasets.transforms_video as T + +import os +from PIL import Image +import json +import numpy as np +import random + +from datasets.categories import ytvos_category_dict as category_dict + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. 
+ + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + meta = { + 'video':vid, + 'sample_indx':sample_indx, + 'bins':bins, + 'frames':vid_frames + } + obj_id_cat = {} + for exp_id, exp_dict in vid_data['expressions'].items(): + obj_id = exp_dict['obj_id'] + if obj_id not in obj_id_cat: + obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category'] + meta['obj_id_cat'] = obj_id_cat + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + meta = self.metas[idx] # dict + + video, sample_indx, bins, frames, obj_id_cat = \ + meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat'] + + # read frames and masks + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_indx: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + imgs.append(img) + + mask = Image.open(mask_path).convert('P') + mask = np.array(mask) + + # create the target + for obj_id in list(obj_id_cat.keys()): + obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary + if (obj_mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(obj_mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + obj_mask = torch.from_numpy(obj_mask) + + # append + masks.append(obj_mask) + boxes.append(box) + + 
+ # transform + w, h = img.size + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': sample_indx, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'obj_ids' : list(obj_id_cat.keys()), + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + if self._transforms: + imgs, target = self._transforms(imgs, target) + imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # # FIXME: handle "valid", since some box may be removed due to random crop + # if torch.any(target['valid'] == 1): # at leatst one instance + # instance_check = True + # else: + # idx = random.randint(0, self.__len__() - 1) + + return imgs, target + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/mbench/gpt_ref-ytvos-cy_20250121151513.py b/.history/mbench/gpt_ref-ytvos-cy_20250121151513.py new file mode 100644 index 0000000000000000000000000000000000000000..aebdedbbef16585f90f8dcfd2c21d26c32440d69 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos-cy_20250121151513.py @@ -0,0 +1,433 @@ +import sys +from os import path as osp +sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..'))) + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot 
as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 + +# Function to encode the image +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + + +# Captioner +ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' +] +def getCaption(video_id, json_data): + #데이터 가져오기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + + cat_names = set() + all_captions = dict() + for obj_id in list(video_data['annotations'][0].keys()): + cat_names.add(video_data['annotations'][0][obj_id]['category_name']) + + # cat_names : person, snowboard + # 1. gpt에서 직접 action의 대상이 될 수 있는가 물어보기 + # 2. ref-youtube-vos 에서 제공하는 카테고리 정보에서 우리가 처리하고 싶은 카테고리 이름만 남긴다 + + for cat_name in list(cat_names) : + image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names] + image_captions = {} + + captioner = OpenAI() + + #0단계: action의 대상이 될 수 있는가? + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + # response_check = captioner.chat.completions.create( + # model="gpt-4o", + # messages=[ + # { + # "role": "user", + # "content": f""" + # Can a {cat_name} be a subject of distinct actions or movements? + # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject. + # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions. + # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE. + # Answer only YES or NONE. + # """ + # } + # ], + # ) + # response_check_content = response_check.choices[0].message.content.strip().lower() + # print(f"Movable Check for {cat_name}: {response_check_content}") + + # if response_check_content == "yes": is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.") + continue + + for i in range(len(image_paths)): + image_path = image_paths[i] + frame_name = frame_names[i] + base64_image = encode_image(image_path) + + #1단계: 필터링 + print(cat_name, frame_name) + response1 = captioner.chat.completions.create( + model="gpt-4o", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions? + Focus only on clear and prominent actions, avoiding minor or ambiguous ones. + Each action should be unique and clearly associated with a specific object. + + Respond with YES if: + - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable. + - The {cat_name}s involve clear, distinguishable actions performed independently. 
+ + Respond with NONE if: + - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person. + - Actions are ambiguous, minor, or not clearly visible. + + If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE. + If the {cat_name} is 'person' and their actions are distinct and clear, output YES. + + Answer only YES or NONE.""" + + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}") + + #2단계: dense caption 만들기 + if should_caption: + response2 = captioner.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f""" + Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image. + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. 
+ Output only the caption.""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + print(f"{image_path} - {frame_name}: {caption}") + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = [] + valid_cat_names = list(all_captions.keys()) + for obj_id in list(video_data['annotations'][0].keys()): + cat = video_data['annotations'][0][obj_id]['category_name'] + if cat in valid_cat_names : valid_obj_ids.append(obj_id) + + return all_captions, valid_obj_ids + + +# Referring expression generator and QA filter +def getRefExp(video_id, frame_name, caption, obj_id, json_data): + # 이미지에 해당 물체 바운딩 박스 그리기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg')) + frame_indx = frame_names.index(frame_name) + obj_data = video_data['annotations'][frame_indx][obj_id] + + bbox = obj_data['bbox'] + cat_name = obj_data['category_name'] + valid = obj_data['valid'] + + if valid == 0: + print("Object not in this frame!") + return {} + + + x_min, y_min, x_max, y_max = bbox + x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max) + cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2) + plt.figure() + plt.imshow(I) + plt.axis('off') + plt.show() + + #cropped object for visibility check + cropped_I = I[y_min:y_max, x_min:x_max] + pil_cropped_I = Image.fromarray(cropped_I) + buff_crop = BytesIO() + pil_cropped_I.save(buff_crop, format='JPEG') + base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8") + + #entire image for referring expression generation + pil_I = Image.fromarray(I) + buff = BytesIO() + pil_I.save(buff, format='JPEG') + base64_I = base64.b64encode(buff.getvalue()).decode("utf-8") + + # 구분 가능 여부 확인 + generator = OpenAI() + response_check = generator.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + + "type": "text", + "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}? + Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible. + + Guidelines: + - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES. + - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE. + - If the object is clearly visible and identifiable as a {cat_name}, respond with YES. + + Output only either YES or NONE. 
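All of the GPT calls in these scripts follow the same round trip: base64-encode a frame, attach it as an image_url content part next to a text prompt, and reduce the reply to a yes/no decision. A trimmed sketch of that pattern, assuming OPENAI_API_KEY is set in the environment; the file path and question in the usage comment are placeholders.

import base64
from openai import OpenAI

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def ask_yes_no(image_path, question, model="gpt-4o"):
    # Sends one image plus a text question and maps the reply to True/False,
    # mirroring the filtering and QA steps used in this script.
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    base64_image = encode_image(image_path)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            }
        ],
    )
    return "yes" in response.choices[0].message.content.strip().lower()

# e.g. visible = ask_yes_no("frames/00000.jpg",
#                           "Is the highlighted person clearly visible? Answer YES or NONE.")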
+ """ + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"}, + } + ] + }, + ] + ) + + response_check_content = response_check.choices[0].message.content.strip().lower() + print(f"is object {obj_id} visible: {response_check_content}") + + if "yes" not in response_check_content: + print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.") + return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False} + + # Referring expression 만들기 + # generator = OpenAI() + response = generator.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}. + Guidelines for creating the referring expression: + 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}). + 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s. + 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}. + 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}. + 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities. + 6. Use '{cat_name}' as the noun for the referring expressions. + Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}). + + {caption} + """ + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + # { + # "type": "image_url", + # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"}, + # } + ], + } + ], + ) + + ref_exp = response.choices[0].message.content.strip() + + #QA filtering + #QA1: 원하는 물체를 설명하는지 + filter = OpenAI() + response1 = filter.chat.completions.create( + model="gpt-4o", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO. + {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response1_content = response1.choices[0].message.content + describesHighlighted = True if "yes" in response1_content.lower() else False + + #QA2: 원하지 않는 물체를 설명하지 않는지 + response2 = filter.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO. 
+ {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response2_content = response2.choices[0].message.content + describesNotHighlighted = True if "yes" in response2_content.lower() else False + + isValid = True if describesHighlighted and not describesNotHighlighted else False + + print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}") + + return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid} + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # # 전체 데이터셋 + # train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # # 전체 데이터셋 메타데이터 + # metas = train_dataset.metas + + with open('mbench/sampled_frame3.json', 'r') as file: + data = json.load(file) + + vid_ids = list(data.keys()) + + all_ref_exps = {} + + #==================GPT 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + # 전체 데이터셋의 vid_id에 대해 + for i in range(1): + vid_id = vid_ids[i] + + #====캡션 만들기==== + caption, valid_obj_ids = getCaption(vid_id, data) + cats_in_vid = list(caption.keys()) + + #====referring expression 만들고 QA filtering==== + ref_expressions = {} + # 각 카테고리별로 + for cat_name in cats_in_vid: + if cat_name not in ref_expressions: + ref_expressions[cat_name] = {} + + # 각 비디오 프레임 별로 + for frame_name in data[vid_id]['frame_names']: + + if frame_name not in ref_expressions[cat_name]: + ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary + + caption = caption[cat_name][frame_name] + + if not caption : continue + else : + # 각 obj id별로 + for obj_id in valid_obj_ids: + ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data) + ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp + + + all_ref_exps[vid_id] = ref_expressions + + with open('mbench/result-cy.json', 'w') as file: + json.dump(all_ref_exps, file) diff --git a/.history/mbench/gpt_ref-ytvos-revised_20250121160858.py b/.history/mbench/gpt_ref-ytvos-revised_20250121160858.py new file mode 100644 index 0000000000000000000000000000000000000000..3f0ae7a62d1ba1c8a439fef1486935d8eb184a76 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos-revised_20250121160858.py @@ -0,0 +1,428 @@ +import sys +from os import path as osp +sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..'))) + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import 
OpenAI +import base64 + +# Function to encode the image +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + +# Captioner +ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' +] +def getCaption(video_id, json_data): + #데이터 가져오기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + + cat_names = set() + all_captions = dict() + for obj_id in list(video_data['annotations'][0].keys()): + cat_names.add(video_data['annotations'][0][obj_id]['category_name']) + + # cat_names : person, snowboard + # 1. gpt에서 직접 action의 대상이 될 수 있는가 물어보기 + # 2. ref-youtube-vos 에서 제공하는 카테고리 정보에서 우리가 처리하고 싶은 카테고리 이름만 남긴다 + + for cat_name in list(cat_names) : + image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names] + image_captions = {} + + captioner = OpenAI() + + #0단계: action의 대상이 될 수 있는가? + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + # response_check = captioner.chat.completions.create( + # model="gpt-4o", + # messages=[ + # { + # "role": "user", + # "content": f""" + # Can a {cat_name} be a subject of distinct actions or movements? + # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject. + # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions. + # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE. + # Answer only YES or NONE. + # """ + # } + # ], + # ) + # response_check_content = response_check.choices[0].message.content.strip().lower() + # print(f"Movable Check for {cat_name}: {response_check_content}") + + # if response_check_content == "yes": is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.") + continue + + for i in range(len(image_paths)): + image_path = image_paths[i] + frame_name = frame_names[i] + base64_image = encode_image(image_path) + + #1단계: 필터링 + #print(f"-----------category name: {cat_name}, frame name: {frame_name}") + response1 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions? + Focus only on clear and prominent actions, avoiding minor or ambiguous ones. + Each action should be unique and clearly associated with a specific object. + + Respond with YES if: + - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable. + - The {cat_name}s involve clear, distinguishable actions performed independently. + + Respond with NONE if: + - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person. + - Actions are ambiguous, minor, or not clearly visible. 
+ + If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE. + If the {cat_name} is 'person' and their actions are distinct and clear, output YES. + + Answer only YES or NONE.""" + + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + #print(f"are {cat_name}s distinguished by action: {response_content}") + + #2단계: dense caption 만들기 + if should_caption: + response2 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f""" + Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image. + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. 
+ Output only the caption.""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = [] + valid_cat_names = list(all_captions.keys()) + for obj_id in list(video_data['annotations'][0].keys()): + cat = video_data['annotations'][0][obj_id]['category_name'] + if cat in valid_cat_names : valid_obj_ids.append(obj_id) + + return all_captions, valid_obj_ids + +# Referring expression generator and QA filter +def getRefExp(video_id, frame_name, caption, obj_id, json_data): + + # 이미지에 해당 물체 바운딩 박스 그리기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg')) + frame_indx = frame_names.index(frame_name) + obj_data = video_data['annotations'][frame_indx][obj_id] + + bbox = obj_data['bbox'] + cat_name = obj_data['category_name'] + valid = obj_data['valid'] + + if valid == 0: + print("Object not in this frame!") + return {} + + + x_min, y_min, x_max, y_max = bbox + x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max) + cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2) + plt.figure() + plt.imshow(I) + plt.axis('off') + plt.show() + + #cropped object for visibility check + cropped_I = I[y_min:y_max, x_min:x_max] + pil_cropped_I = Image.fromarray(cropped_I) + buff_crop = BytesIO() + pil_cropped_I.save(buff_crop, format='JPEG') + base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8") + + #entire image for referring expression generation + pil_I = Image.fromarray(I) + buff = BytesIO() + pil_I.save(buff, format='JPEG') + base64_I = base64.b64encode(buff.getvalue()).decode("utf-8") + + # 구분 가능 여부 확인 + generator = OpenAI() + response_check = generator.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + + "type": "text", + "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}? + Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible. + + Guidelines: + - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES. + - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE. + - If the object is clearly visible and identifiable as a {cat_name}, respond with YES. + + Output only either YES or NONE. 
+ """ + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"}, + } + ] + }, + ] + ) + + response_check_content = response_check.choices[0].message.content.strip().lower() + #print(f"is object {obj_id} visible: {response_check_content}") + + if "yes" not in response_check_content: + print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.") + return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False} + + # Referring expression 만들기 + # generator = OpenAI() + response = generator.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + + "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}. + Guidelines for creating the referring expression: + 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}). + 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s. + 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}. + 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}. + 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities. + 6. Use '{cat_name}' as the noun for the referring expressions. + Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}). + + {caption} + """ + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + # { + # "type": "image_url", + # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"}, + # } + ], + } + ], + ) + + ref_exp = response.choices[0].message.content.strip() + + #QA filtering + #QA1: 원하는 물체를 설명하는지 + filter = OpenAI() + response1 = filter.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO. + {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response1_content = response1.choices[0].message.content + describesHighlighted = True if "yes" in response1_content.lower() else False + + #QA2: 원하지 않는 물체를 설명하지 않는지 + response2 = filter.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO. 
+ {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response2_content = response2.choices[0].message.content + notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True + + isValid = True if describesHighlighted and notDescribesNotHighlighted else False + + #print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}") + #print(f"ref exp: {ref_exp}") + #print("") + + return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid} + + +if __name__ == '__main__': + with open('mbench/sampled_frame3.json', 'r') as file: + data = json.load(file) + + vid_ids = list(data.keys()) + all_ref_exps = {} + + os.environ['OPENAI_API_KEY'] = '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' + + # 전체 데이터셋의 vid_id에 대해 + for i in range(50): + vid_id = vid_ids[i] + + #====캡션 만들기==== + # print("=====================captioner========================") + captions, valid_obj_ids = getCaption(vid_id, data) + cats_in_vid = list(captions.keys()) + # print() + + #====referring expression 만들고 QA filtering==== + # print("=====================referring expression generator & QA filter========================") + ref_expressions = {} + + # 각 카테고리별로 + for cat_name in cats_in_vid: + if cat_name not in ref_expressions: + ref_expressions[cat_name] = {} + # 각 비디오 프레임 별로 + for frame_name in data[vid_id]['frame_names']: + # print(f'--------category: {cat_name}, frame_name: {frame_name}') + + if frame_name not in ref_expressions[cat_name]: + ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary + caption = captions[cat_name][frame_name] + if not caption : continue + else : + # 각 obj id별로 + for obj_id in valid_obj_ids: + ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data) + ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp + + all_ref_exps[vid_id] = ref_expressions + + + with open('mbench/result_revised50.json', 'w') as file: + json.dump(all_ref_exps, file, indent=4) + + + + + diff --git a/.history/mbench/gpt_ref-ytvos_20250119070820.py b/.history/mbench/gpt_ref-ytvos_20250119070820.py new file mode 100644 index 0000000000000000000000000000000000000000..621627209495dc3ef794fc1e1c086f29f21f0c56 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_20250119070820.py @@ -0,0 +1,286 @@ +from datasets import build_dataset +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 + +os.environ['OPENAI_API_KEY'] = 
'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' + +# Function to encode the image +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + +def getCaption(video_id, json_data): + #데이터 가져오기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + + cat_names = set() + for obj_id in list(video_data['annotations'][0].keys()): + cat_names.add(video_data['annotations'][0][obj_id]['category_name']) + + if len(cat_names) == 1: + cat_name = next(iter(cat_names)) + else: + print("more than 2 categories") + return -1 + + image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names] + image_captions = {} + + captioner = OpenAI() + for i in range(len(image_paths)): + image_path = image_paths[i] + frame_name = frame_names[i] + base64_image = encode_image(image_path) + + #1단계: 필터링 + response1 = captioner.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + + #2단계: dense caption 만들기 + if should_caption: + response2 = captioner.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f""" + Describe the image in detail focusing on the {cat_name}s' actions. + 1. Each action should be prominent, clear and unique, describing the corresponding object only. + 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’. + 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting. + 4. 
Do not include actions that needs to be guessed or suggested.""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + else: + caption = None + + image_captions[frame_name] = caption + return image_captions + +def getRefExp(video_id, frame_name, caption, obj_id, json_data): + # 이미지에 해당 물체 바운딩 박스 그리기 + video_data = json_data[video_id] + frame_names = video_data['frame_names'] + video_path = video_data['video_path'] + I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg')) + frame_indx = frame_names.index(frame_name) + obj_data = video_data['annotations'][frame_indx][obj_id] + + bbox = obj_data['bbox'] + cat_name = obj_data['category_name'] + valid = obj_data['valid'] + + if valid == 0: + print("Object not in this frame!") + return {} + + + x_min, y_min, x_max, y_max = bbox + x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max) + cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2) + plt.figure() + plt.imshow(I) + plt.axis('off') + plt.show() + pil_I = Image.fromarray(I) + buff = BytesIO() + pil_I.save(buff, format='JPEG') + base64_I = base64.b64encode(buff.getvalue()).decode("utf-8") + + #ref expression 만들기 + generator = OpenAI() + response = generator.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box. + 1. The referring expression describes the action and does not contain information about appearance or location in the picture. + 2. Focus only on prominent actions and avoid overly detailed or indeterminate details. + 3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words. + 4. The referring expression should only describe the highlighted {cat_name} and not any other. + 5. Use '{cat_name}' as the noun for the referring expressions. + Output only the referring expression. + {caption}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + ref_exp = response.choices[0].message.content + + #QA filtering + #QA1: 원하는 물체를 설명하는지 + filter = OpenAI() + response1 = filter.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO. + {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response1_content = response1.choices[0].message.content + describesHighlighted = True if "yes" in response1_content.lower() else False + + #QA2: 원하지 않는 물체를 설명하지 않는지 + response2 = filter.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO. 
+ {ref_exp}""", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"}, + }, + ], + } + ], + ) + + response2_content = response2.choices[0].message.content + describesNotHighlighted = True if "yes" in response2_content.lower() else False + + isValid = True if describesHighlighted and not describesNotHighlighted else False + + print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}") + + return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid} + +def createRefExp(video_id, json_data): + video_data = json_data[video_id] + obj_ids = list(video_data['annotations'][0].keys()) + frame_names = video_data['frame_names'] + + captions_per_frame = getCaption(video_id, json_data) + + if captions_per_frame == -1: + print("There are more than 2 cateories") + return + + + video_ref_exps = {} + + for frame_name in frame_names: + frame_caption = captions_per_frame[frame_name] + + if frame_caption == None: + video_ref_exps[frame_name] = None + + else: + frame_ref_exps = {} + for obj_id in obj_ids: + exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data) + frame_ref_exps[obj_id] = exp_per_obj + video_ref_exps[frame_name] = frame_ref_exps + + return video_ref_exps + +if __name__ == '__main__': + with open('mbench/sampled_frame3.json', 'r') as file: + data = json.load(file) + + videos = set() + with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file: + manual_select = list(file) + for frame in manual_select: + result = json.loads(frame) + videos.add(result['video']) + videos = list(videos) + + + all_video_refs = {} + for i in range(10): + video_id = videos[i] + video_ref = createRefExp(video_id, data) + all_video_refs[video_id] = video_ref \ No newline at end of file diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183936.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183936.py new file mode 100644 index 0000000000000000000000000000000000000000..de6149e9bcaafadd04aea9b75a7a3aaf171393ee --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183936.py @@ -0,0 +1,199 @@ +import os +import sys +from os import path as osp +from io import BytesIO + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + 
frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, 
contoured_frames + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190533.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190533.py new file mode 100644 index 0000000000000000000000000000000000000000..957a573b4639bcd04b47456a28cb173c6b978650 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190533.py @@ -0,0 +1,429 @@ +import os + +import sys +from os import path as osp +from io import BytesIO + +from ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w 
// 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, contoured_frames + + +def getCaption(idx, color_mask=True): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. 
{"person", "elephant", ...} + all_captions = dict() + + base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask) + marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. + + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. + 3) Each action is unambiguously recognizable and distinct. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. + 2) They show no noticeable action beyond standing or minor movements. + + Answer strictly with either "YES" or "NONE". + """ + + + response1 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Therefore, your expressions for these {cat_name}s should describe unique action of each object. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. 
Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + 10. Do not include descriptions of appearance such as clothes, color, size, shape etc. + 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous. + 12. Do not mention object IDs. + 13. Use '{cat_name}' as the noun for the referring expressions. + + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. + """ + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Please describe each {cat_name} using **clearly observable** and **specific** actions. + + ## Guidelines: + 1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object). + 2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw). + 3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”). + 4. Do not use vague expressions like "interacting with something"** or "engaging with another object." + Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button"). + 5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions. + 7. Base your description on the following action definitions: + - Facial with object manipulation + - General body movement, body position or pattern + - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object"). + - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone"). + + ## Output Format: + - For each labeled {cat_name}, output one line in the format: + ID. action-oriented description + + Example: + 1. a bear grasping the edge of a wood with its front paws + 2. the bear pushing another bear, leaning forward + + **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”). + **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). 
+ For each labeled {cat_name}, output referring expressions for each object id. + """ + if should_caption: + response2 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": dense_caption_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return vid_id, all_captions, valid_obj_ids + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json") + + args = parser.parse_args() + + print(args.save_caption_path, flush=True) + print(args.save_valid_obj_ids_path, flush=True) + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(370): + vid_id, all_captions, valid_obj_ids = getCaption(i, True) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190813.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190813.py new file mode 100644 index 0000000000000000000000000000000000000000..0b07f482fc6ac58f78b690db64f24454930fef25 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190813.py @@ -0,0 +1,427 @@ +import os 
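+# Overview: number_objects_and_encode() draws each object's mask boundary (or a translucent
+# color mask) and writes its numeric ID at the mask centroid; getCaption() then asks GPT
+# whether the labeled objects of a category perform clearly distinct actions and, only if so,
+# requests one action-centric description per numbered object.
+# Output files (shapes as populated in __main__ below):
+#   --save_caption_path       : {vid_id: {cat_name: {frame_name: caption or None}}}
+#   --save_valid_obj_ids_path : {vid_id: {cat_name: [obj_id, ...]}}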
+import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from os import path as osp +from io import BytesIO + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: 
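+                            # zero-area contour (m00 == 0): fall back to (0, 0) as the anchor for the ID label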
+ center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, contoured_frames + + +def getCaption(idx, color_mask=True): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...} + all_captions = dict() + + base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask) + marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. 
+ + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. + 3) Each action is unambiguously recognizable and distinct. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. + 2) They show no noticeable action beyond standing or minor movements. + + Answer strictly with either "YES" or "NONE". + """ + + + response1 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Therefore, your expressions for these {cat_name}s should describe unique action of each object. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + 10. Do not include descriptions of appearance such as clothes, color, size, shape etc. + 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous. + 12. Do not mention object IDs. + 13. Use '{cat_name}' as the noun for the referring expressions. + + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. + """ + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Please describe each {cat_name} using **clearly observable** and **specific** actions. + + ## Guidelines: + 1. 
Focus on visible, prominent actions only (e.g., running, pushing, grasping an object). + 2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw). + 3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”). + 4. Do not use vague expressions like "interacting with something"** or "engaging with another object." + Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button"). + 5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions. + 7. Base your description on the following action definitions: + - Facial with object manipulation + - General body movement, body position or pattern + - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object"). + - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone"). + + ## Output Format: + - For each labeled {cat_name}, output one line in the format: + ID. action-oriented description + + Example: + 1. a bear grasping the edge of a wood with its front paws + 2. the bear pushing another bear, leaning forward + + **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”). + **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled {cat_name}, output referring expressions for each object id. 
+ """ + if should_caption: + response2 = captioner.chat.completions.create( + model="chatgpt-4o-latest", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": dense_caption_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return vid_id, all_captions, valid_obj_ids + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json") + + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(370): + vid_id, all_captions, valid_obj_ids = getCaption(i, True) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130220417.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130220417.py new file mode 100644 index 0000000000000000000000000000000000000000..7edcef6aa7554657892aff2516273e8bd84a7da1 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250130220417.py @@ -0,0 +1,427 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from os import path as osp +from io import BytesIO + +from mbench.ytvos_ref import 
build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = 
center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, contoured_frames + + +def getCaption(idx, color_mask=True): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...} + all_captions = dict() + + base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask) + marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. + + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. 
+ 3) Each action is unambiguously recognizable and distinct. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. + 2) They show no noticeable action beyond standing or minor movements. + + Answer strictly with either "YES" or "NONE". + """ + + + response1 = captioner.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Therefore, your expressions for these {cat_name}s should describe unique action of each object. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + 10. Do not include descriptions of appearance such as clothes, color, size, shape etc. + 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous. + 12. Do not mention object IDs. + 13. Use '{cat_name}' as the noun for the referring expressions. + + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. + """ + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Please describe each {cat_name} using **clearly observable** and **specific** actions. + + ## Guidelines: + 1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object). + 2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw). + 3. 
Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”). + 4. Do not use vague expressions like "interacting with something"** or "engaging with another object." + Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button"). + 5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions. + 7. Base your description on the following action definitions: + - Facial with object manipulation + - General body movement, body position or pattern + - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object"). + - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone"). + + ## Output Format: + - For each labeled {cat_name}, output one line in the format: + ID. action-oriented description + + Example: + 1. a bear grasping the edge of a wood with its front paws + 2. the bear pushing another bear, leaning forward + + **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”). + **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled {cat_name}, output referring expressions for each object id. + """ + if should_caption: + response2 = captioner.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": dense_caption_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return vid_id, all_captions, valid_obj_ids + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json") + + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 
'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(370): + vid_id, all_captions, valid_obj_ids = getCaption(i, True) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140559.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140559.py new file mode 100644 index 0000000000000000000000000000000000000000..48dc049fb725cde5fd97d6e89935ecf0286ba0d2 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140559.py @@ -0,0 +1,461 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import time + +from os import path as osp +from io import BytesIO + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = 
cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, contoured_frames + + +def getCaption(idx, model='gpt-4o', color_mask=True): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. 
{"person", "elephant", ...} + all_captions = dict() + + base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask) + #marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. + + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.) + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. + 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. + 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance. + 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion + + Answer strictly with either "YES" or "NONE". + """ + + response1 = captioner.chat.completions.create( + # model="chatgpt-4o-latest", + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object. 
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Therefore, your expressions for these {cat_name}s should describe unique action of each object. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + 10. Do not include descriptions of appearance such as clothes, color, size, shape etc. + 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous. + 12. Do not mention object IDs. + 13. Use '{cat_name}' as the noun for the referring expressions. + + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. + """ + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + + I want to use your expressions to create an **action-centric referring expression** dataset. + Please describe each {cat_name} using **clearly observable** and **specific** actions. + + --- + ## Guidelines: + 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head"). + 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”). + 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions. + 7. Base your description on these action definitions: + - Avoid using term 'minimal' or 'slightly'. + - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back") + - details such as motion and intention, facial with object manipulation + - movements with objects or other entities when they are prominent and observable. expression should be specific. 
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X)) + --- + + ## Output Format: + - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format : + object id. using {cat_name} as subject noun, action-oriented description + (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.) + - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal). + + ### Example + If the frame has 2 labeled bears, your output should look like: + 1. the bear reaching his right arm while leaning forward to capture the prey + 2. a bear standing upright facing right, touching the bike aside + + --- + **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”). + **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed). + **Do not include markdown** in the output. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled {cat_name}, output referring expressions for each object id. + """ + MAX_RETRIES = 2 + retry_count = 0 + + if should_caption: + while retry_count < MAX_RETRIES: + + response2 = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": dense_caption_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + # caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + + caption = response2.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. 
Caption generation failed.") + + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return all_captions, valid_obj_ids + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json") + + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(370): + vid_id, all_captions, valid_obj_ids = getCaption(i, True) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141240.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141240.py new file mode 100644 index 0000000000000000000000000000000000000000..ac10a64448640a89e3d7c035abaf10fcf5d68b7f --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141240.py @@ -0,0 +1,460 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import time + +from os import path as osp +from io import BytesIO + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure 
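+ # A minimal, hedged sketch of building the OpenAI client from the environment instead of
+ # hardcoding the key the way the __main__ block at the bottom of this script does. It assumes
+ # OPENAI_API_KEY is already exported in the shell; OpenAI() with no arguments reads the same
+ # variable. The helper name is illustrative only and is not referenced elsewhere in this file.
+ def make_captioner_from_env():
+     from openai import OpenAI  # imported again here only so the sketch stands alone
+     return OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))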
# (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + # 윤곽선 중심 계산 + moments = cv2.moments(contour) + if moments["m00"] != 0: # 중심 계산 가능 여부 확인 + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용 + + # 텍스트 배경 (검은색 배경 만들기) + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + # 텍스트 배경 그리기 (검은색 배경) + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + # 텍스트 그리기 (흰색 텍스트) + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, 
(text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, vid_cat_cnts, contoured_frames + + +def getCaption(idx, model='gpt-4o', color_mask=True): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...} + all_captions = dict() + + base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask) + #marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. + + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.) + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. + 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. 
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance. + 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion + + Answer strictly with either "YES" or "NONE". + """ + + response1 = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + I want to use your expressions to create a action-centric referring expression dataset. + Therefore, your expressions for these {cat_name}s should describe unique action of each object. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'. + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + 10. Do not include descriptions of appearance such as clothes, color, size, shape etc. + 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous. + 12. Do not mention object IDs. + 13. Use '{cat_name}' as the noun for the referring expressions. + + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. + """ + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. + + I want to use your expressions to create an **action-centric referring expression** dataset. + Please describe each {cat_name} using **clearly observable** and **specific** actions. + + --- + ## Guidelines: + 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head"). + 3. 
**Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”). + 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions. + 7. Base your description on these action definitions: + - Avoid using term 'minimal' or 'slightly'. + - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back") + - details such as motion and intention, facial with object manipulation + - movements with objects or other entities when they are prominent and observable. expression should be specific. + (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X)) + --- + + ## Output Format: + - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format : + object id. using {cat_name} as subject noun, action-oriented description + (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.) + - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal). + + ### Example + If the frame has 2 labeled bears, your output should look like: + 1. the bear reaching his right arm while leaning forward to capture the prey + 2. a bear standing upright facing right, touching the bike aside + + --- + **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”). + **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed). + **Do not include markdown** in the output. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled {cat_name}, output referring expressions for each object id. + """ + MAX_RETRIES = 2 + retry_count = 0 + + if should_caption: + while retry_count < MAX_RETRIES: + + response2 = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": dense_caption_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + # caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + + caption = response2.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. 
Caption generation failed.") + + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return all_captions, valid_obj_ids + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json") + + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(370): + vid_id, all_captions, valid_obj_ids = getCaption(i, True) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172754.py b/.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172754.py new file mode 100644 index 0000000000000000000000000000000000000000..c5efab129d003d0163b5c6bd9a01eb4d3942a054 --- /dev/null +++ b/.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172754.py @@ -0,0 +1,656 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import time + +from os import path as osp +from io import BytesIO +import random + +from mbench.ytvos_ref import build as build_ytvos_ref +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import skimage +from io import BytesIO + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import 
functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle +import textwrap + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +from openai import OpenAI +import base64 +import json +import requests +from openai.error import APIConnectionError, OpenAIError + +def number_objects_and_encode_old(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + if color_mask == False: + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + for i, contour in enumerate(contours): + moments = cv2.moments(contour) + if moments["m00"] != 0: + cx = int(moments["m10"] / moments["m00"]) + cy = int(moments["m01"] / moments["m00"]) + else: + cx, cy = contour[0][0] + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + text_size = cv2.getTextSize(text, font, 1, 2)[0] + text_w, text_h = text_size + + cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5), + (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1) + + cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2), + font, 1, (255, 255, 255), 2) + + else: + alpha = 0.08 + + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 2) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + + font_scale = 0.9 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 + text_y = center_y + + rect_start = (text_x - 5, text_y - text_size[1] - 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # 
plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) # Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, contoured_frames, vid_cat_cnts + + +def number_objects_and_encode(idx, color_mask=False): + encoded_frames = {} + contoured_frames = {} # New dictionary for original images + vid_cat_cnts = {} + + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + frame_indx = vid_meta['sample_indx'] + cat_names = set(vid_meta['obj_id_cat'].values()) + imgs = vid_data[0] + + for cat in cat_names: + cat_frames = [] + contour_frames = [] + frame_cat_cnts = {} + + for i in range(imgs.size(0)): + frame_name = frame_indx[i] + frame = np.copy(imgs[i].permute(1, 2, 0).numpy()) + frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy()) + + frame_data = vid_data[2][frame_name] + obj_ids = list(frame_data.keys()) + + cat_cnt = 0 + + for j in range(len(obj_ids)): + obj_id = obj_ids[j] + obj_data = frame_data[obj_id] + obj_bbox = obj_data['bbox'] + obj_valid = obj_data['valid'] + obj_mask = obj_data['mask'].numpy().astype(np.uint8) + obj_cat = obj_data['category_name'] + + if obj_cat == cat and obj_valid: + cat_cnt += 1 + + contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, colors[j], 3) + cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2) + + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = obj_id + font_scale = 1.2 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 + text_y = center_y + + rect_start = (text_x - 5, text_y - text_size[1] - 5) + rect_end = (text_x + text_size[0] + 5, text_y + 3) + + contour_thickness = 1 + rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness) + rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness) + + cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness) + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2) + + + if color_mask: + alpha = 0.08 + colored_obj_mask = np.zeros_like(frame) + colored_obj_mask[obj_mask == 1] = colors[j] + frame[obj_mask == 1] = ( + (1 - alpha) * frame[obj_mask == 1] + + alpha * colored_obj_mask[obj_mask == 1] + ) + + # plt.figure(figsize=(12, 8)) + # plt.imshow(frame) + # plt.title(f"frame {frame_name}") + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + frame_cat_cnts[frame_name] = cat_cnt + + buffer.seek(0) 
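+ # The labeled frame above was JPEG-encoded into the in-memory buffer and base64-encoded so that
+ # getCaption() can send it to the chat completions API as a "data:image/jpeg;base64,..." image_url.
+ # The same BytesIO buffer is rewound and truncated below and reused for the plain contour image,
+ # so each frame needs only one buffer allocation.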
# Reuse buffer instead of creating a new one + buffer.truncate() + frame_for_contour = Image.fromarray(frame_for_contour) + frame_for_contour.save(buffer, format='jpeg') + buffer.seek(0) + contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8")) + + encoded_frames[cat] = cat_frames + contoured_frames[cat] = contour_frames + vid_cat_cnts[cat] = frame_cat_cnts + + return encoded_frames, contoured_frames, vid_cat_cnts + + + +def getCaption(idx, model='gpt-4o'): + vid_meta = metas[idx] + vid_data = train_dataset[idx] + vid_id = vid_meta['video'] + print(f"vid id: {vid_id}\n") + + frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16] + cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...} + all_captions = dict() + + # color_mask = random.choice([True, False]) + color_mask = random.choices([False, True], weights=[60, 40])[0] + + base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask) + #marked = "mask with boundary" if color_mask else "boundary" + + for cat_name in list(cat_names) : + + is_movable = False + if cat_name in ytvos_category_valid_list : + is_movable = True + + if not is_movable: + print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n') + + + image_captions = {} + captioner = OpenAI() + cat_base64_frames = base64_frames[cat_name] + # cont_base64_frames = contoured_frames[cat_name] + + for i in range(len(cat_base64_frames)): + frame_name = frame_indx[i] + # cont_base64_image = cont_base64_frames[i] + base64_image = cat_base64_frames[i] + should_filter = False + frame_cat_cnts = vid_cat_cnts[cat_name][frame_name] + + if frame_cat_cnts >= 2: + should_filter = True + else: + print(f"Skipping {cat_name}: There is single or no object.", end='\n\n') + + + if is_movable and should_filter: + #1단계: 필터링 + print(f"-----------category name: {cat_name}, frame name: {frame_name}") + caption_filter_text = f""" + You are a visual assistant analyzing a single frame from a video. + In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker. + + Are {cat_name}s in the image performing all different and recognizable actions or postures? + Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position), + facial expressions, and any notable interactions with objects or other {cat_name}s or people. + + Only focus on obvious, prominent actions that can be reliably identified from this single frame. + + - Respond with "YES" if: + 1) Most of {cat_name}s exhibit clearly different, unique actions or poses. + (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.) + 2) You can see visible significant differences in action and posture, that an observer can identify at a glance. + 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing. + + - Respond with "NONE" if: + 1) The actions or pose are not clearly differentiable or too similar. + 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance. + 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion + + Answer strictly with either "YES" or "NONE". 
+ """ + + response1 = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": caption_filter_text, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + } + ], + } + ], + ) + response_content = response1.choices[0].message.content + should_caption = True if "yes" in response_content.lower() else False + print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n') + + else: + should_caption = False + + #2단계: dense caption 만들기 + dense_caption_prompt_1 = f""" + In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}. + + Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions. + + 1. Focus only on clear, unique, and prominent actions that distinguish each object. + 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image. + 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions. + 4. Do not include common-sense or overly general descriptions like 'the elephant walks'. + 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements. + 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'. + - expressions like 'seems to be', 'appears to be' are BANNED! + 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + 8. Include interactions with objects or other entities when they are prominent and observable. + 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc. + 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous. + 11. Do not mention object IDs. + 12. Use '{cat_name}' as the noun for the referring expressions. + + Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific. + + - Your answer should contain details, and follow the following format: + object id. action-oriented description + (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right. + 2. a person bending over and touching his boots to tie the shoelace.) + - for action-oriented description, use {cat_name} as subject noun + + **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal). + Please pay attention to the categories of these objects and don’t change them. + Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + Output referring expressions for each object id. Please start your answer:""" + + + dense_caption_prompt_2 = f""" + You are an advanced visual language model analyzing a video frame. + In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary. 
+ + Your task is to generate **action-oriented descriptions** for each labeled {cat_name}. + Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors. + + --- + ## Key Guidelines: + 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing. + - Example: "grabbing a branch and pulling it down" (**(O) Specific**) + - Avoid: "moving slightly to the side" (**(X) Too vague**) + + 2. **Do not describe appearance, color, or position**—focus purely on the action. + - (X) "A large brown bear standing on the left" + - (O) "The bear is lifting its front paws and swiping forward." + + 3. **Use dynamic, action-specific verbs** rather than passive descriptions. + - (O) "The giraffe is tilting its head and sniffing the ground." + - (X) "The giraffe is near a tree and looking around." + + 4. **Avoid assumptions, emotions, or speculative phrasing.** + - (X) "The person seems excited" / "The person might be preparing to jump." + - (O) "The person is pushing its front legs against the rock and leaping forward." + + 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'. + - expressions like 'seems to be', 'appears to be' are BANNED! + 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'. + + 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**. + - **Each object should have a unique, descriptive action.** + - (X) "Two dogs are running." + - (O) "1. One dog is chasing another, its legs stretched mid-air. + 2. The other dog is looking back while speeding up." + + --- + ## Output Format: + - Each labeled **{cat_name}** should have exactly **one line of description**. + - Format: `ID. {cat_name} + action-based description` + - (O) Example: + ``` + 1. The person is leaning forward while opening a bag with both hands. + 2. The person is holding onto a rope and pulling themselves up. + ``` + - **Ensure that each object is described individually.** + - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed). + + --- + ## Additional Instructions: + - **Do NOT** use expressions like "it appears that..." or "it seems like...". + - **Do NOT** mention object IDs in the description (only use the provided format). + - **DO NOT** include markdown formatting (no bullet points, no asterisks). + - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories. + + Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer: + """ + + + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary. + + I am building an **action-centric referring expression** dataset. + Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**. + + --- + ## Guidelines: + 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit"). + 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump"). + 4. 
**Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction. + 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**. + 7. Base your descriptions on these principles: + - **Avoid words like 'minimal' or 'slightly'.** + - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back"). + - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item"). + - **Specify actions with other objects or entities** only when they are clear and observable. + - (O) "pushing another person" + - (X) "interacting with another object" + + --- + ## Output Format: + - Each labeled **{cat_name}** must have **exactly one line**. + - Format: `ID. {cat_name} + action-based description` + - (O) Example: + ``` + 1. The person is holding ski poles and skiing down a snowy mountain with bent knees. + 2. The person is pulling a baby carriage while smiling. + ``` + - **Ensure each object is described individually.** + - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed). + + --- + ## Example: + If the frame has two labeled **bears**, your output should be: + ``` + 1. The bear is reaching out its right paw while leaning forward to catch prey. + 2. A bear is standing upright, facing right, and touching the bike beside it. + ``` + + --- + ## Additional Instructions: + - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right"). + - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed). + - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols). + - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories. + + Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:""" + + + MAX_RETRIES = 3 + retry_count = 0 + + if should_caption: + while retry_count < MAX_RETRIES: + selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2]) + + response2 = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": selected_prompt, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + # caption = response2.choices[0].message.content + #print(f"{image_path} - {frame_name}: {caption}") + + caption = response2.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. 
Caption generation failed.") + + else: + caption = None + + image_captions[frame_name] = caption + all_captions[cat_name] = image_captions + + # final : also prepare valid object ids + valid_obj_ids = dict() + + for cat in cat_names: + if cat in ytvos_category_valid_list: + obj_id_cat = vid_meta['obj_id_cat'] + valid_cat_ids = [] + for obj_id in list(obj_id_cat.keys()): + if obj_id_cat[obj_id] == cat: + valid_cat_ids.append(obj_id) + valid_obj_ids[cat] = valid_cat_ids + + return vid_id, all_captions, valid_obj_ids + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json") + parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json") + + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_ytvos_ref(image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + # 색상 후보 8개 (RGB 형식) + colors = [ + (255, 0, 0), # Red + (0, 255, 0), # Green + (0, 0, 255), # Blue + (255, 255, 0), # Yellow + (255, 0, 255), # Magenta + (0, 255, 255), # Cyan + (128, 0, 128), # Purple + (255, 165, 0) # Orange + ] + + ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' + ] + + #==================gpt 돌리기=================== + os.environ['OPENAI_API_KEY'] = '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' + + result_captions = {} + result_valid_obj_ids = {} + + for i in range(len(metas)): + try: + vid_id, all_captions, valid_obj_ids = getCaption(i) + + if vid_id not in result_captions: + result_captions[vid_id] = all_captions + if vid_id not in result_valid_obj_ids: + result_valid_obj_ids[vid_id] = valid_obj_ids + + except (requests.exceptions.ConnectionError, APIConnectionError) as e: + print(f"created caption until {i}", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) + + print("Finished!", flush=True) + + with open(args.save_caption_path, "w") as file: + json.dump(result_captions, file, indent=4) + + with open(args.save_valid_obj_ids_path, "w") as file: + json.dump(result_valid_obj_ids, file, indent=4) diff --git a/.history/mbench/make_ref-ytvos_json_20250113182322.py b/.history/mbench/make_ref-ytvos_json_20250113182322.py new file mode 100644 index 0000000000000000000000000000000000000000..7dda79faa25d68f38e673bb8632fe5549671aa88 --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250113182322.py @@ -0,0 +1,100 @@ +from datasets import build_dataset +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import 
cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + data_idx = 0 + + while data_idx < 10: + + #하나의 비디오에 대해 + video_data = {} + video_id = metas[data_idx]['video'] + video_data['bins'] = metas[data_idx]['bins'] + annotation_data = [] + frame_names = [] + + while metas[data_idx]['video'] == video_id: + + obj_id = metas[data_idx]['obj_id'] + sample_id = metas[data_idx]['sample_id'] + sample_frames_id = metas[data_idx]['sample_frames_id'] + sample_frame_idx = sample_frames_id.index(sample_id) + + frames = metas[data_idx]['frames'] + + frame_name = frames[sample_id] + cat_name = metas[data_idx]['category'] + + bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :] + + obj_data = {obj_id: { + "category_name" : cat_name, + "bbox": bbox + }} + + + annotation_data.append(obj_data) + + frame_names.append(frame_name) + + data_idx += 1 + + video_data['annotations'] = annotation_data + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + + entire_json[video_id] = video_data + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250113182734.py b/.history/mbench/make_ref-ytvos_json_20250113182734.py new file mode 100644 index 0000000000000000000000000000000000000000..d460275ab038cc8b9d9087e1e3595de21ef69a14 --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250113182734.py @@ -0,0 +1,102 @@ +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + +import sys +from pathlib import Path +import os +from os import path as osp +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + data_idx = 0 + + while 
data_idx < 10: + + #하나의 비디오에 대해 + video_data = {} + video_id = metas[data_idx]['video'] + video_data['bins'] = metas[data_idx]['bins'] + annotation_data = [] + frame_names = [] + + while metas[data_idx]['video'] == video_id: + + obj_id = metas[data_idx]['obj_id'] + sample_id = metas[data_idx]['sample_id'] + sample_frames_id = metas[data_idx]['sample_frames_id'] + sample_frame_idx = sample_frames_id.index(sample_id) + + frames = metas[data_idx]['frames'] + + frame_name = frames[sample_id] + cat_name = metas[data_idx]['category'] + + bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :] + + obj_data = {obj_id: { + "category_name" : cat_name, + "bbox": bbox + }} + + + annotation_data.append(obj_data) + + frame_names.append(frame_name) + + data_idx += 1 + + video_data['annotations'] = annotation_data + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + + entire_json[video_id] = video_data + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250113182817.py b/.history/mbench/make_ref-ytvos_json_20250113182817.py new file mode 100644 index 0000000000000000000000000000000000000000..5675fde75aad78185c0398149d2800b28879cde6 --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250113182817.py @@ -0,0 +1,103 @@ +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + + +from pathlib import Path +import os +from os import path as osp +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + data_idx = 0 + + while data_idx < 10: + + #하나의 비디오에 대해 + video_data = {} + video_id = metas[data_idx]['video'] + video_data['bins'] = metas[data_idx]['bins'] + annotation_data = [] + frame_names = [] + + while metas[data_idx]['video'] == video_id: + + obj_id = metas[data_idx]['obj_id'] + sample_id = metas[data_idx]['sample_id'] + sample_frames_id = metas[data_idx]['sample_frames_id'] + sample_frame_idx = sample_frames_id.index(sample_id) + + frames = metas[data_idx]['frames'] + + frame_name = frames[sample_id] + cat_name = metas[data_idx]['category'] + + bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :] + + obj_data = {obj_id: { + 
"category_name" : cat_name, + "bbox": bbox + }} + + + annotation_data.append(obj_data) + + frame_names.append(frame_name) + + data_idx += 1 + + video_data['annotations'] = annotation_data + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + + entire_json[video_id] = video_data + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250113182842.py b/.history/mbench/make_ref-ytvos_json_20250113182842.py new file mode 100644 index 0000000000000000000000000000000000000000..1cdf04b6312f4ae2bda1f420a07d3a0b3de62aef --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250113182842.py @@ -0,0 +1,102 @@ +import sys +from os import path as osp +sys.path.append(os.path.abspath(osp.join(osp.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + + +from pathlib import Path +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + data_idx = 0 + + while data_idx < 10: + + #하나의 비디오에 대해 + video_data = {} + video_id = metas[data_idx]['video'] + video_data['bins'] = metas[data_idx]['bins'] + annotation_data = [] + frame_names = [] + + while metas[data_idx]['video'] == video_id: + + obj_id = metas[data_idx]['obj_id'] + sample_id = metas[data_idx]['sample_id'] + sample_frames_id = metas[data_idx]['sample_frames_id'] + sample_frame_idx = sample_frames_id.index(sample_id) + + frames = metas[data_idx]['frames'] + + frame_name = frames[sample_id] + cat_name = metas[data_idx]['category'] + + bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :] + + obj_data = {obj_id: { + "category_name" : cat_name, + "bbox": bbox + }} + + + annotation_data.append(obj_data) + + frame_names.append(frame_name) + + data_idx += 1 + + video_data['annotations'] = annotation_data + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + + entire_json[video_id] = video_data + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = 
build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250113183130.py b/.history/mbench/make_ref-ytvos_json_20250113183130.py new file mode 100644 index 0000000000000000000000000000000000000000..5123a82c73aa5225d9422c1669e829d11ee28206 --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250113183130.py @@ -0,0 +1,102 @@ +import sys +from os import path as osp +sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + + +from pathlib import Path +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + data_idx = 0 + + while data_idx < 10: + + #하나의 비디오에 대해 + video_data = {} + video_id = metas[data_idx]['video'] + video_data['bins'] = metas[data_idx]['bins'] + annotation_data = [] + frame_names = [] + + while metas[data_idx]['video'] == video_id: + + obj_id = metas[data_idx]['obj_id'] + sample_id = metas[data_idx]['sample_id'] + sample_frames_id = metas[data_idx]['sample_frames_id'] + sample_frame_idx = sample_frames_id.index(sample_id) + + frames = metas[data_idx]['frames'] + + frame_name = frames[sample_id] + cat_name = metas[data_idx]['category'] + + bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :] + + obj_data = {obj_id: { + "category_name" : cat_name, + "bbox": bbox + }} + + + annotation_data.append(obj_data) + + frame_names.append(frame_name) + + data_idx += 1 + + video_data['annotations'] = annotation_data + video_data['frame_names'] = frame_names + video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + + entire_json[video_id] = video_data + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250116141513.py b/.history/mbench/make_ref-ytvos_json_20250116141513.py new file mode 100644 index 0000000000000000000000000000000000000000..b1559a3ec3cd2fc53029a482ee09def964606ed6 --- /dev/null +++ 
b/.history/mbench/make_ref-ytvos_json_20250116141513.py @@ -0,0 +1,103 @@ +import sys +from os import path as osp +sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + + +from pathlib import Path +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) +from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + vid_idx = 0 + + while vid_idx < len(train_dataset): + + #하나의 비디오에 대해 + video_data = {} + video_train_frames, video_train_info = train_dataset[vid_idx] + video_meta = metas[vid_idx] + + video_id = video_meta['video'] + video_data['bins'] = video_meta['bins'] + bin_nums = len(video_meta['bins']) + obj_nums = len(list(video_meta['obj_id_cat'].keys())) + + annotation_data = [] + frame_names = [] + + for i in range(bin_nums): + bin_data = {} + for j in range(obj_nums): + obj_id = str(j+1) + obj_data = { + "category_name":video_meta['obj_id_cat'][obj_id], + "bbox":video_train_info['boxes'][i*obj_nums+j, :] + } + bin_data[obj_id] = obj_data + annotation_data.append(bin_data) + + video_data['annotations'] = annotation_data + + + sample_indx = metas[vid_idx]['sample_indx'] + frames = metas[vid_idx]['frames'] + for i in sample_indx: + frame_name = frames[i] + frame_names.append(frame_name) + + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + entire_json[video_id] = video_data + + vid_idx += 1 + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + print(type(entire_json_dict)) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/make_ref-ytvos_json_20250118024325.py b/.history/mbench/make_ref-ytvos_json_20250118024325.py new file mode 100644 index 0000000000000000000000000000000000000000..f6cbcf7783a5fa3895fce884c8cf62de45c44b12 --- /dev/null +++ b/.history/mbench/make_ref-ytvos_json_20250118024325.py @@ -0,0 +1,108 @@ +import sys +import os +from os import path as osp +sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts + + +from pathlib import Path +import io + +import numpy as np +import pandas as pd +import regex as re +import json + +import cv2 +from PIL import Image, ImageDraw +import torch +from torchvision.transforms import functional as F + +from skimage import measure # (pip install scikit-image) 
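# --- Editor's note: illustrative sketch, not part of the original patch ---------------
# createJson() below stores bounding boxes and valid flags that come out of the dataset
# as torch tensors. json.dumps() raises "TypeError: Object of type Tensor is not JSON
# serializable" on raw tensors, which is why this revision converts them with .tolist()
# and .item() before writing. The tiny helper below is a hypothetical, self-contained
# example of that conversion pattern (the name _example_serialize_obj and the sample
# values are assumptions for illustration only).
import json as _json
import torch as _torch

def _example_serialize_obj():
    box = _torch.tensor([10.0, 20.0, 110.0, 220.0])   # xyxy box, like video_train_info['boxes'][k]
    valid = _torch.tensor(1)                           # like video_train_info['valid'][k]
    obj_data = {
        "category_name": "person",
        "bbox": box.tolist(),    # plain Python list -> JSON serializable
        "valid": valid.item(),   # plain Python int  -> JSON serializable
    }
    return _json.dumps(obj_data, indent=4)
# ---------------------------------------------------------------------------------------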
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely) + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.collections import PatchCollection +from matplotlib.patches import Rectangle + + +import ipywidgets as widgets +from IPython.display import display, clear_output + +#==================json 만들기=================== +def createJson(train_dataset, metas): + entire_json = {} + + #초기화 + vid_idx = 0 + + while vid_idx < len(train_dataset): + + #하나의 비디오에 대해 + video_data = {} + video_train_frames, video_train_info = train_dataset[vid_idx] + video_meta = metas[vid_idx] + + video_id = video_meta['video'] + video_data['bins'] = video_meta['bins'] + bin_nums = len(video_meta['bins']) + obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())]) + + annotation_data = [] + frame_names = [] + + for i in range(bin_nums): + bin_data = {} + for j in range(obj_nums): + obj_id = str(j+1) + try: + obj_data = { + "category_name":video_meta['obj_id_cat'][obj_id], + "bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist(), + "valid":video_train_info['valid'][i*obj_nums+j].item() + } + except: + obj_data = {} + bin_data[obj_id] = obj_data + annotation_data.append(bin_data) + + video_data['annotations'] = annotation_data + + + sample_indx = metas[vid_idx]['sample_indx'] + frames = metas[vid_idx]['frames'] + for i in sample_indx: + frame_name = frames[i] + frame_names.append(frame_name) + + video_data['frame_names'] = frame_names + video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id) + entire_json[video_id] = video_data + + vid_idx += 1 + + return entire_json + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + #==================데이터 불러오기=================== + # 전체 데이터셋 + train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args) + + # 전체 데이터셋 메타데이터 + metas = train_dataset.metas + + #==================json 만들기=================== + entire_json_dict = createJson(train_dataset, metas) + print(type(entire_json_dict)) + entire_json = json.dumps(entire_json_dict, indent=4) + + with open('mbench/sampled_frame2.json', mode='w') as file: + file.write(entire_json) diff --git a/.history/mbench/ytvos_ref_20250121152309.py b/.history/mbench/ytvos_ref_20250121152309.py new file mode 100644 index 0000000000000000000000000000000000000000..9c03ce2c423582837ca12f06dc7b5f3ef6696725 --- /dev/null +++ b/.history/mbench/ytvos_ref_20250121152309.py @@ -0,0 +1,264 @@ +""" +Ref-YoutubeVOS data loader +""" +from pathlib import Path + +import torch +from torch.utils.data import Dataset + +import os +from PIL import Image +import json +import numpy as np +import random + +# from datasets.categories import ytvos_category_dict as category_dict + + +category_dict = { + 'airplane': 0, 'ape': 1, 'bear': 2, 'bike': 3, 'bird': 4, 'boat': 5, 'bucket': 6, 'bus': 7, 'camel': 8, 'cat': 9, + 'cow': 10, 'crocodile': 11, 'deer': 12, 'dog': 13, 'dolphin': 14, 'duck': 15, 'eagle': 16, 'earless_seal': 17, + 'elephant': 18, 'fish': 19, 'fox': 20, 'frisbee': 21, 'frog': 22, 'giant_panda': 23, 'giraffe': 24, 'hand': 25, + 'hat': 26, 'hedgehog': 27, 'horse': 28, 'knife': 29, 'leopard': 30, 'lion': 31, 'lizard': 32, 'monkey': 33, + 'motorbike': 34, 'mouse': 35, 'others': 36, 'owl': 37, 'paddle': 38, 'parachute': 39, 'parrot': 40, 'penguin': 41, + 'person': 42, 'plant': 43, 'rabbit': 44, 'raccoon': 45, 
'sedan': 46, 'shark': 47, 'sheep': 48, 'sign': 49, + 'skateboard': 50, 'snail': 51, 'snake': 52, 'snowboard': 53, 'squirrel': 54, 'surfboard': 55, 'tennis_racket': 56, + 'tiger': 57, 'toilet': 58, 'train': 59, 'truck': 60, 'turtle': 61, 'umbrella': 62, 'whale': 63, 'zebra': 64 +} + + + +class YTVOSDataset(Dataset): + """ + A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper: + "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark" + (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf). + The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first + dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download + through the Youtube-VOS referring video object segmentation competition page at: + https://competitions.codalab.org/competitions/29139 + Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into + two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can + currently only be done on the competition 'validation' subset using the competition's server, as + annotations were publicly released only for the 'train' subset of the competition. + + """ + def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool, + num_frames: int, max_skip: int): + self.img_folder = img_folder + self.ann_file = ann_file + self._transforms = transforms + self.return_masks = return_masks # not used + self.num_frames = num_frames + self.max_skip = max_skip + # create video meta data + self.prepare_metas() + + print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas)) + print('\n') + + def prepare_metas(self): + # read object information + with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f: + subset_metas_by_video = json.load(f)['videos'] + + # read expression data + with open(str(self.ann_file), 'r') as f: + subset_expressions_by_video = json.load(f)['videos'] + self.videos = list(subset_expressions_by_video.keys()) + + self.metas = [] + skip_vid_count = 0 + + for vid in self.videos: + vid_meta = subset_metas_by_video[vid] + vid_data = subset_expressions_by_video[vid] + vid_frames = sorted(vid_data['frames']) + vid_len = len(vid_frames) + + if vid_len < 11: + #print(f"Too short video: {vid} with frame length {vid_len}") + skip_vid_count += 1 + continue + + + # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2) + start_idx , end_idx = 2, vid_len-2 + bin_size = (end_idx - start_idx) // 4 + + bins = [] + for i in range(4): + bin_start = start_idx + i * bin_size + bin_end = bin_start + bin_size if i < 3 else end_idx + + bins.append((bin_start, bin_end)) + + # Random sample one frame from each bin + sample_indx = [] + for start_idx, end_idx in bins: + sample_indx.append(random.randint(start_idx, end_idx - 1)) + sample_indx.sort() # Ensure indices are in order + + + meta = { + 'video':vid, + 'sample_indx':sample_indx, + 'bins':bins, + 'frames':vid_frames + } + obj_id_cat = {} + for exp_id, exp_dict in vid_data['expressions'].items(): + obj_id = exp_dict['obj_id'] + if obj_id not in obj_id_cat: + obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category'] + meta['obj_id_cat'] = obj_id_cat + self.metas.append(meta) + + print(f"skipped {skip_vid_count} short videos") + + + @staticmethod + def bounding_box(img): + rows = np.any(img, axis=1) + cols = 
np.any(img, axis=0) + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + return rmin, rmax, cmin, cmax # y1, y2, x1, x2 + + def __len__(self): + return len(self.metas) + + def __getitem__(self, idx): + meta = self.metas[idx] # dict + + video, sample_indx, bins, frames, obj_id_cat = \ + meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat'] + + # read frames and masks + annos = {} + imgs, labels, boxes, masks, valid = [], [], [], [], [] + for frame_indx in sample_indx: + frame_name = frames[frame_indx] + img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg') + mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png') + img = Image.open(img_path).convert('RGB') + imgs.append(img) + + mask = Image.open(mask_path).convert('P') + mask = np.array(mask) + + frame_annotations = {} + + # create the target + for obj_id in list(obj_id_cat.keys()): + obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary + if (obj_mask > 0).any(): + y1, y2, x1, x2 = self.bounding_box(obj_mask) + box = torch.tensor([x1, y1, x2, y2]).to(torch.float) + valid.append(1) + val = 1 + else: # some frame didn't contain the instance + box = torch.tensor([0, 0, 0, 0]).to(torch.float) + valid.append(0) + val = 0 + obj_mask = torch.from_numpy(obj_mask) + + # append + masks.append(obj_mask) + boxes.append(box) + + frame_annotations[obj_id] = { + 'category_name': obj_id_cat[obj_id], + 'bbox': box, + 'valid' : val, + 'mask': obj_mask + } + + annos[frame_indx] = frame_annotations + + + # transform + w, h = img.size + boxes = torch.stack(boxes, dim=0) + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + masks = torch.stack(masks, dim=0) + target = { + 'frames_idx': sample_indx, # [T,] + 'boxes': boxes, # [T, 4], xyxy + 'masks': masks, # [T, H, W] + 'valid': torch.tensor(valid), # [T,] + 'obj_ids' : list(obj_id_cat.keys()), + 'orig_size': torch.as_tensor([int(h), int(w)]), + 'size': torch.as_tensor([int(h), int(w)]) + } + + # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform + # if self._transforms: + # imgs, target = self._transforms(imgs, target) + # imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] + # else: + imgs = np.array(imgs) + imgs = torch.tensor(imgs.transpose(0, 3, 1, 2)) + + + # # FIXME: handle "valid", since some box may be removed due to random crop + # if torch.any(target['valid'] == 1): # at leatst one instance + # instance_check = True + # else: + # idx = random.randint(0, self.__len__() - 1) + + return imgs, target, annos + + +def make_coco_transforms(image_set, max_size=640): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + # we do not use the 'val' set since the annotations are inaccessible + if image_set == 'val': + return T.Compose([ + T.RandomResize([360], max_size=640), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.ytvos_path) + assert root.exists(), f'provided YTVOS path {root} does not 
exist' + PATHS = { + "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"), + "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually + } + img_folder, ann_file = PATHS[image_set] + # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks, + # num_frames=args.num_frames, max_skip=args.max_skip) + dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks, + num_frames=args.num_frames, max_skip=args.max_skip) + return dataset + diff --git a/.history/mbench_a2d/gpt_a2d_numbered_20250205111640.py b/.history/mbench_a2d/gpt_a2d_numbered_20250205111640.py new file mode 100644 index 0000000000000000000000000000000000000000..2f2b93c1aae931e5d7f2fcea318f2ddc7de47ea2 --- /dev/null +++ b/.history/mbench_a2d/gpt_a2d_numbered_20250205111640.py @@ -0,0 +1,82 @@ +from datasets import build_dataset +import argparse +import opts + +import sys +import os +import time + +import numpy as np +import matplotlib.pyplot as plt +import cv2 +from io import BytesIO +import base64 +from PIL import Image + +from openai import OpenAI + +def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False): + #마스크 색칠할지 + if color_mask == True: + alpha = 0.1 + + colored_mask = np.zeros_like(frame) + colored_mask[mask == 1] = [255, 0, 0] + frame[mask == 1] = ( + (1 - alpha) * frame[mask == 1] + + alpha * colored_mask[mask == 1] + ) + + #마스크 아웃라인 그리기 + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, [255, 0, 0], 2) + + #instance_id 적을지 + if label_number == True: + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = str(instance_id) + font_scale = 0.6 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2) + + # plt.figure(figsize=(6, 10)) + # plt.imshow(frame) + # plt.title(text_query) + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + encoded_frame = base64.b64encode(buffer.read()).decode("utf-8") + + return encoded_frame + + +if __name__ == "__main__": + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + + train_dataset = build_dataset('a2d', image_set = 'train', args = args) + text_annotations = train_dataset.text_annotations \ No newline at end of file diff --git a/.history/mbench_a2d/gpt_a2d_numbered_20250205122340.py b/.history/mbench_a2d/gpt_a2d_numbered_20250205122340.py new file mode 100644 index 0000000000000000000000000000000000000000..30f5a49a52cd7cf1d026191764a2da47bf509ebd --- /dev/null +++ 
b/.history/mbench_a2d/gpt_a2d_numbered_20250205122340.py @@ -0,0 +1,196 @@ +from datasets import build_dataset +import argparse +import opts + +import sys +import os +import time + +import numpy as np +import matplotlib.pyplot as plt +import cv2 +from io import BytesIO +import base64 +from PIL import Image +import json + +from openai import OpenAI + +def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False): + #마스크 색칠할지 + if color_mask == True: + alpha = 0.1 + + colored_mask = np.zeros_like(frame) + colored_mask[mask == 1] = [255, 0, 0] + frame[mask == 1] = ( + (1 - alpha) * frame[mask == 1] + + alpha * colored_mask[mask == 1] + ) + + #마스크 아웃라인 그리기 + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, [255, 0, 0], 2) + + #instance_id 적을지 + if label_number == True: + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = str(instance_id) + font_scale = 0.6 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2) + + # plt.figure(figsize=(6, 10)) + # plt.imshow(frame) + # plt.title(text_query) + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + encoded_frame = base64.b64encode(buffer.read()).decode("utf-8") + + return encoded_frame + +def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True): + + base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number) + + captioner = OpenAI() + + #필터링하지 않고 바로 ref exp 만들기 + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary. + I also give you a text query describing the marked object. + I want to use your expression to create an **action-centric referring expression** dataset. + Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions + --- + ## Guidelines: + 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head"). + 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”). + 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. 
If there are multiple objects, ensure the description for the marked object **differentiates** its action. + 7. Base your description on these action definitions: + - Avoid using term 'minimal' or 'slightly'. + - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back") + - details such as motion and intention, facial with object manipulation + - movements with object or other entities when they are prominent and observable. expression should be specific. + (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X)) + -- + ## Output Format: + - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format : + object id. action-oriented description + (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.) + ### Example + If the frame has 1 labeled bear, your output should look like: + 1. the bear reaching his right arm while leaning forward to capture the prey + --- + **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”). + **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed). + **Do not include markdown** in the output. + Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled object, output referring expressions for each object id. + """ + prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}" + + MAX_RETRIES = 2 + retry_count = 0 + + while retry_count < MAX_RETRIES: + response = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt_with_text_query, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + + caption = response.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. 
Caption generation failed.") + + else: + caption = None + + return caption + +if __name__ == "__main__": + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json') + args = parser.parse_args() + + train_dataset = build_dataset('a2d', image_set = 'train', args = args) + text_annotations = train_dataset.text_annotations + + all_captions = {} + + os.environ['OPENAI_API_KEY'] = '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' + + for idx in range(100): + imgs, target = train_dataset[idx] + frames_idx = target['frames_idx'].tolist() + text_query, vid_id, frame_id, instance_id = text_annotations[idx] + + frame_id = frame_id - 1 + frame_order = frames_idx.index(frame_id) + + frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy() + mask = target['masks'].numpy().astype(np.uint8).squeeze() + + caption = getCaption(frame, mask, instance_id, text_query) + if vid_id not in all_captions: + all_captions[vid_id] = {frame_id : caption} + else: + all_captions[vid_id][frame_id] = caption + + + with open(args.save_caption_path, 'w') as file: + json.dump(all_captions, file, indent=4) + diff --git a/.history/mbench_a2d/gpt_a2d_numbered_20250205152326.py b/.history/mbench_a2d/gpt_a2d_numbered_20250205152326.py new file mode 100644 index 0000000000000000000000000000000000000000..077150c0b8dbc312dfdc7335e334720d0caef8e9 --- /dev/null +++ b/.history/mbench_a2d/gpt_a2d_numbered_20250205152326.py @@ -0,0 +1,200 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts +import time + +import numpy as np +import matplotlib.pyplot as plt +import cv2 +from io import BytesIO +import base64 +from PIL import Image +import json + +from openai import OpenAI + +def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False): + #마스크 색칠할지 + if color_mask == True: + alpha = 0.1 + + colored_mask = np.zeros_like(frame) + colored_mask[mask == 1] = [255, 0, 0] + frame[mask == 1] = ( + (1 - alpha) * frame[mask == 1] + + alpha * colored_mask[mask == 1] + ) + + #마스크 아웃라인 그리기 + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, [255, 0, 0], 2) + + #instance_id 적을지 + if label_number == True: + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = str(instance_id) + font_scale = 0.6 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심 + text_y = center_y + # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심 + + # 텍스트 배경 사각형 좌표 계산 + rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단 + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2) + + # plt.figure(figsize=(6, 10)) + # plt.imshow(frame) + # plt.title(text_query) + # plt.tight_layout() + # 
plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + encoded_frame = base64.b64encode(buffer.read()).decode("utf-8") + + return encoded_frame + +def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True): + + base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number) + + captioner = OpenAI() + + #필터링하지 않고 바로 ref exp 만들기 + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary. + I also give you a text query describing the marked object. + I want to use your expression to create an **action-centric referring expression** dataset. + Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions + --- + ## Guidelines: + 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head"). + 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”). + 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action. + 7. Base your description on these action definitions: + - Avoid using term 'minimal' or 'slightly'. + - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back") + - details such as motion and intention, facial with object manipulation + - movements with object or other entities when they are prominent and observable. expression should be specific. + (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X)) + -- + ## Output Format: + - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format : + object id. action-oriented description + (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.) + ### Example + If the frame has 1 labeled bear, your output should look like: + 1. the bear reaching his right arm while leaning forward to capture the prey + --- + **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”). + **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed). + **Do not include markdown** in the output. + Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled object, output referring expressions for each object id. 
+ """ + prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}" + + MAX_RETRIES = 2 + retry_count = 0 + + while retry_count < MAX_RETRIES: + response = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt_with_text_query, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + + caption = response.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. Caption generation failed.") + + else: + caption = None + + return caption + +if __name__ == "__main__": + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json') + args = parser.parse_args() + + train_dataset = build_dataset('a2d', image_set = 'train', args = args) + text_annotations = train_dataset.text_annotations + + all_captions = {} + + #os.environ['OPENAI_API_KEY'] = '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' + os.environ['OPENAI_API_KEY'] = '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' + + for idx in range(100): + imgs, target = train_dataset[idx] + frames_idx = target['frames_idx'].tolist() + text_query, vid_id, frame_id, instance_id = text_annotations[idx] + print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True) + + frame_id = frame_id - 1 + frame_order = frames_idx.index(frame_id) + + frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy() + mask = target['masks'].numpy().astype(np.uint8).squeeze() + + caption = getCaption(frame, mask, instance_id, text_query) + if vid_id not in all_captions: + all_captions[vid_id] = {frame_id : caption} + else: + all_captions[vid_id][frame_id] = caption + + print("Finished!", flush=True) + + with open(args.save_caption_path, 'w') as file: + json.dump(all_captions, file, indent=4) + diff --git a/.history/mbench_a2d/gpt_a2d_numbered_20250207110257.py b/.history/mbench_a2d/gpt_a2d_numbered_20250207110257.py new file mode 100644 index 0000000000000000000000000000000000000000..32811050ac4261c8752eb49187c25e547a742903 --- /dev/null +++ b/.history/mbench_a2d/gpt_a2d_numbered_20250207110257.py @@ -0,0 +1,213 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from datasets import build_dataset +import argparse +import opts +import time + +import numpy as np +import matplotlib.pyplot as plt +import cv2 +from io import BytesIO +import base64 +from PIL import Image +import json + +from openai import OpenAI + +def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False): + #마스크 색칠할지 + if color_mask == True: + alpha = 0.1 + + colored_mask = np.zeros_like(frame) + colored_mask[mask == 1] = [255, 0, 0] + frame[mask == 1] = ( + (1 - alpha) * frame[mask == 1] + + alpha 
* colored_mask[mask == 1] + ) + + # draw the mask outline + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(frame, contours, -1, [255, 0, 0], 2) + + # whether to write the instance_id + if label_number == True: + if len(contours) > 0: + largest_contour = max(contours, key=cv2.contourArea) + M = cv2.moments(largest_contour) + if M["m00"] != 0: + center_x = int(M["m10"] / M["m00"]) + center_y = int(M["m01"] / M["m00"]) + else: + center_x, center_y = 0, 0 + + font = cv2.FONT_HERSHEY_SIMPLEX + text = str(instance_id) + font_scale = 0.6 + text_size = cv2.getTextSize(text, font, font_scale, 2)[0] + text_x = center_x - text_size[0] // 1 # horizontal center of the text + text_y = center_y + # text_y = center_y + text_size[1] // 2 # vertical center of the text + + # compute the text background rectangle coordinates + rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle + # rect_end = (text_x + text_size[0] + 5, text_y + 5) + rect_end = (text_x + text_size[0] + 5, text_y) + + cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1) + cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2) + + # plt.figure(figsize=(6, 10)) + # plt.imshow(frame) + # plt.title(text_query) + # plt.tight_layout() + # plt.axis('off') + # plt.show() + + buffer = BytesIO() + frame = Image.fromarray(frame) + frame.save(buffer, format='jpeg') + buffer.seek(0) + encoded_frame = base64.b64encode(buffer.read()).decode("utf-8") + + return encoded_frame + +def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True): + + base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number) + + captioner = OpenAI() + + # build the referring expression directly, without filtering + dense_caption_prompt = f""" + You are a visual assistant analyzing a single frame of a video. + In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary. + I also give you a text query describing the marked object. + I want to use your expression to create an **action-centric referring expression** dataset. + Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions + --- + ## Guidelines: + 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object). + 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head"). + 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”). + 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button"). + 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction. + 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action. + 7. Base your description on these action definitions: + - Avoid using term 'minimal' or 'slightly'. + - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back") + - details such as motion and intention, facial with object manipulation + - movements with object or other entities when they are prominent and observable. expression should be specific. 
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X)) + -- + ## Output Format: + - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format : + object id. action-oriented description + (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.) + ### Example + If the frame has 1 labeled bear, your output should look like: + 1. the bear reaching his right arm while leaning forward to capture the prey + --- + **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”). + **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed). + **Do not include markdown** in the output. + Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one). + For each labeled object, output referring expressions for each object id. + """ + prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}" + + MAX_RETRIES = 2 + retry_count = 0 + + while retry_count < MAX_RETRIES: + response = captioner.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt_with_text_query, + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, + }, + ], + } + ], + ) + + + caption = response.choices[0].message.content.strip() + caption_lower = caption.lower().lstrip() + if caption_lower.startswith("1.") and not any( + phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"] + ): + break + print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})") + retry_count += 1 + time.sleep(2) + + if retry_count == MAX_RETRIES: + caption = None + print("Max retries reached. 
Caption generation failed.") + + else: + caption = None + + return caption + +if __name__ == "__main__": + parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()]) + parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json') + args = parser.parse_args() + + train_dataset = build_dataset('a2d', image_set = 'train', args = args) + text_annotations = train_dataset.text_annotations + + all_captions = {} + + #os.environ['OPENAI_API_KEY'] = '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' + os.environ['OPENAI_API_KEY'] = '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' + + first_text_query = "" + for idx in range(300): + imgs, target = train_dataset[idx] + frames_idx = target['frames_idx'].tolist() + text_query, vid_id, frame_id, instance_id = text_annotations[idx] + + if text_query == first_text_query: + continue + + print(f"------------vid id: {vid_id}, frame id: {frame_id}, instance id: {instance_id}", flush=True) + + frame_id = frame_id - 1 + frame_order = frames_idx.index(frame_id) + + frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy() + mask = target['masks'].numpy().astype(np.uint8).squeeze() + + caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini') + + if vid_id in all_captions: + if frame_id in all_captions[vid_id]: + all_captions[vid_id][frame_id][instance_id] = caption + else: + all_captions[vid_id][frame_id] = {instance_id : caption} + else: + all_captions[vid_id] = {frame_id : {instance_id: caption}} + + if idx % 50 == 0: + with open(args.save_caption_path, 'w') as file: + json.dump(all_captions, file, indent=4) + + print("Finished!", flush=True) + + with open(args.save_caption_path, 'w') as file: + json.dump(all_captions, file, indent=4) + diff --git a/.history/slurm_script/jupyter_20250121151552.sh b/.history/slurm_script/jupyter_20250121151552.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f04e43ce8f6b2bb595d2acaa4aa23900c0e08d1 --- /dev/null +++ b/.history/slurm_script/jupyter_20250121151552.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +#SBATCH --job-name=jupyter +#SBATCH --partition=a5000 +#SBATCH --nodelist=node04 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + srun jupyter notebook --no-browser --port=7890 diff --git a/.history/slurm_script/jupyter_20250121151643.sh b/.history/slurm_script/jupyter_20250121151643.sh new file mode 100644 index 0000000000000000000000000000000000000000..8016d1cd5bbbde20ce08b458be6636042329d45a --- /dev/null +++ b/.history/slurm_script/jupyter_20250121151643.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +#SBATCH --job-name=jupyter +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + srun jupyter notebook --no-browser --port=7890 diff --git 
a/.history/slurm_script/mbench_gpt_a2d_20250205122515.sh b/.history/slurm_script/mbench_gpt_a2d_20250205122515.sh new file mode 100644 index 0000000000000000000000000000000000000000..272f6b2debfaaf173a3b18e43a41175b6c21e42f --- /dev/null +++ b/.history/slurm_script/mbench_gpt_a2d_20250205122515.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_a2d +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_a2d.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy.py \ + --save_caption_path mbench_a2d/numbered_captions.json \ No newline at end of file diff --git a/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155940.sh b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155940.sh new file mode 100644 index 0000000000000000000000000000000000000000..700e8cd581fa8bd7ad478f24dd1a331dca4826d1 --- /dev/null +++ b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155940.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_revised +#SBATCH --partition=a5000 +#SBATCH --nodelist=node04 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_revised.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_revised.py \ No newline at end of file diff --git a/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121160841.sh b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121160841.sh new file mode 100644 index 0000000000000000000000000000000000000000..a1138085006d50d5ac38ab1697dbe9387c27a87c --- /dev/null +++ b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121160841.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_revised50 +#SBATCH --partition=a5000 +#SBATCH --nodelist=node04 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_revised50.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_revised.py \ No newline at end of file diff --git a/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250124085144.sh b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250124085144.sh new file mode 100644 index 0000000000000000000000000000000000000000..ebc3e3eb87ce0237841b3d0e21bea3399918ffaa --- /dev/null +++ b/.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250124085144.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_revised50 +#SBATCH --partition=a5000 +#SBATCH --nodelist=node04 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_revised50.out + cd 
/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos-revised.py \ No newline at end of file diff --git a/.history/slurm_script/mbench_gpt_ref-ytvos_20250119070944.sh b/.history/slurm_script/mbench_gpt_ref-ytvos_20250119070944.sh new file mode 100644 index 0000000000000000000000000000000000000000..5f508bfcaa6330ddfe61012d5cd8f8968f58eee7 --- /dev/null +++ b/.history/slurm_script/mbench_gpt_ref-ytvos_20250119070944.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos.py \ No newline at end of file diff --git a/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130190228.sh b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130190228.sh new file mode 100644 index 0000000000000000000000000000000000000000..6efa2f04f01effd7e59d092a9e0302505d2b7366 --- /dev/null +++ b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130190228.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy.py \ + --save_caption_path mbench/numbered_captions.json \ + --save_valid_obj_ids_path mbench/numbered_valid_obj_ids.json \ No newline at end of file diff --git a/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250201140706.sh b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250201140706.sh new file mode 100644 index 0000000000000000000000000000000000000000..214982940b3825256ee2667dd84ff0c0b7e328f0 --- /dev/null +++ b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250201140706.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy.py \ + --save_caption_path mbench/numbered_captions_gpt-4o.json \ + --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o.json \ No newline at end of file diff --git a/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250202183206.sh b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250202183206.sh new file mode 100644 index 0000000000000000000000000000000000000000..9ad0c1b0c158086bcc48659bedcd1edbbffb8ccb --- /dev/null +++ 
b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250202183206.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy.py \ + --save_caption_path mbench/numbered_captions_gpt-4o_no_mask_color.json \ + --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_no_mask_color.json \ No newline at end of file diff --git a/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171604.sh b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171604.sh new file mode 100644 index 0000000000000000000000000000000000000000..a498a1739c34ce060b0e8802a68c2c2ca896c1cc --- /dev/null +++ b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171604.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered_final +#SBATCH --partition=a4000 +#SBATCH --nodelist=node05 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered_final.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py \ + --save_caption_path mbench/numbered_captions_gpt-4o_final.json \ + --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_final.json \ No newline at end of file diff --git a/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207172920.sh b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207172920.sh new file mode 100644 index 0000000000000000000000000000000000000000..27693ebe2eec10425b7ea8820129e0cbeb838ab1 --- /dev/null +++ b/.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207172920.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered_final +#SBATCH --partition=a5000 +#SBATCH --nodelist=node04 +#SBATCH --gres=gpu:1 +#SBATCH --time=14-00:00:00 +#SBATCH --mem=5G +#SBATCH --cpus-per-task=4 +#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered_final.out + cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer + + ml purge + ml load cuda/12.1 + eval "$(conda shell.bash hook)" + conda activate referformer + + python3 mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py \ + --save_caption_path mbench/numbered_captions_gpt-4o_final.json \ + --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_final.json \ No newline at end of file diff --git a/hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/ca26d90c9e8e071d0bc31b570aef68306d0be1db4330471d10a117061a15a991.lock b/hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/ca26d90c9e8e071d0bc31b570aef68306d0be1db4330471d10a117061a15a991.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/pytorch_model.bin 
b/hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b new file mode 100644 index 0000000000000000000000000000000000000000..96cf756627594683e4d906d9b3ebd56ed7d7bc5c --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b +size 9999791010 diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cc6c13cb9acd48b061e2d2664a50963c338b4998 b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cc6c13cb9acd48b061e2d2664a50963c338b4998 new file mode 100644 index 0000000000000000000000000000000000000000..cc6c13cb9acd48b061e2d2664a50963c338b4998 --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cc6c13cb9acd48b061e2d2664a50963c338b4998 @@ -0,0 +1,962 @@ +{ + "metadata": { + "total_size": 22919639040 + }, + "weight_map": { + "decoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "decoder.block.0.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + 
"decoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.o.weight": 
"pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + 
"decoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.o.weight": 
"pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.layer_norm.weight": 
"pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.20.layer.2.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.21.layer.2.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00003.bin", + 
"decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.22.layer.2.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.23.layer.2.layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + 
"decoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + 
"decoder.block.7.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.embed_tokens.weight": "pytorch_model-00001-of-00003.bin", + "decoder.final_layer_norm.weight": "pytorch_model-00003-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + 
"encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": 
"pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + 
"encoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + 
"encoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", 
+ "encoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + 
"encoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.embed_tokens.weight": "pytorch_model-00001-of-00003.bin", + "encoder.final_layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "lm_head.weight": "pytorch_model-00003-of-00003.bin", + "mm_projector.0.bias": "pytorch_model-00003-of-00003.bin", + "mm_projector.0.weight": "pytorch_model-00003-of-00003.bin", + "mm_projector.2.bias": "pytorch_model-00003-of-00003.bin", + "mm_projector.2.weight": "pytorch_model-00003-of-00003.bin", + "shared.weight": "pytorch_model-00001-of-00003.bin", + "vision_tower.vision_tower.vision_model.embeddings.class_embedding": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "pytorch_model-00003-of-00003.bin", + 
"vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.post_layernorm.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.post_layernorm.weight": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.pre_layrnorm.bias": "pytorch_model-00003-of-00003.bin", + "vision_tower.vision_tower.vision_model.pre_layrnorm.weight": "pytorch_model-00003-of-00003.bin" + } +} diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b new file mode 100644 index 0000000000000000000000000000000000000000..cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b @@ -0,0 +1,962 @@ +{ + "metadata": { + "total_size": 22919639040 + }, + "weight_map": { + "decoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.1.EncDecAttention.k.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.1.EncDecAttention.o.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.1.EncDecAttention.q.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.1.EncDecAttention.v.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "decoder.block.0.layer.2.DenseReluDense.wo.weight": 
"model-00002-of-00003.safetensors", + "decoder.block.0.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.1.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.10.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": 
"model-00002-of-00003.safetensors", + "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.11.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.12.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.13.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + 
"decoder.block.14.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.14.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.15.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.16.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.1.EncDecAttention.k.weight": 
"model-00002-of-00003.safetensors", + "decoder.block.17.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.17.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.18.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.19.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + 
"decoder.block.2.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.2.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.20.layer.0.SelfAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.0.SelfAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.20.layer.0.SelfAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.0.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.1.EncDecAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.1.EncDecAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.1.EncDecAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.1.EncDecAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.1.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.2.DenseReluDense.wo.weight": "model-00003-of-00003.safetensors", + "decoder.block.20.layer.2.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.0.SelfAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.0.SelfAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.0.SelfAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.0.SelfAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.0.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.1.EncDecAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.1.EncDecAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.1.EncDecAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.1.EncDecAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.1.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.2.DenseReluDense.wo.weight": "model-00003-of-00003.safetensors", + "decoder.block.21.layer.2.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.0.SelfAttention.k.weight": 
"model-00003-of-00003.safetensors", + "decoder.block.22.layer.0.SelfAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.0.SelfAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.0.SelfAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.0.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.1.EncDecAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.1.EncDecAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.1.EncDecAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.1.EncDecAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.1.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.2.DenseReluDense.wo.weight": "model-00003-of-00003.safetensors", + "decoder.block.22.layer.2.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.0.SelfAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.0.SelfAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.0.SelfAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.0.SelfAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.0.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.1.EncDecAttention.k.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.1.EncDecAttention.o.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.1.EncDecAttention.q.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.1.EncDecAttention.v.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.1.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.2.DenseReluDense.wo.weight": "model-00003-of-00003.safetensors", + "decoder.block.23.layer.2.layer_norm.weight": "model-00003-of-00003.safetensors", + "decoder.block.3.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + 
"decoder.block.3.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.3.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.4.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.5.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + 
"decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.6.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.7.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.8.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors", + 
"decoder.block.9.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors", + "decoder.block.9.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors", + "decoder.embed_tokens.weight": "model-00001-of-00003.safetensors", + "decoder.final_layer_norm.weight": "model-00003-of-00003.safetensors", + "encoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.1.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.10.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + 
"encoder.block.11.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.11.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.12.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.13.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.14.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.15.layer.1.DenseReluDense.wo.weight": 
"model-00001-of-00003.safetensors", + "encoder.block.15.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.16.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.17.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.18.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.19.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.0.SelfAttention.q.weight": 
"model-00001-of-00003.safetensors", + "encoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.2.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.20.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.21.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.22.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + 
"encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.23.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.3.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.4.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.5.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.6.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + 
"encoder.block.7.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.7.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.8.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors", + "encoder.block.9.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors", + "encoder.embed_tokens.weight": "model-00001-of-00003.safetensors", + "encoder.final_layer_norm.weight": "model-00001-of-00003.safetensors", + "lm_head.weight": "model-00003-of-00003.safetensors", + "mm_projector.0.bias": "model-00003-of-00003.safetensors", + "mm_projector.0.weight": "model-00003-of-00003.safetensors", + "mm_projector.2.bias": "model-00003-of-00003.safetensors", + "mm_projector.2.weight": "model-00003-of-00003.safetensors", + "shared.weight": "model-00001-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.embeddings.class_embedding": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": 
"model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + 
"vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.pre_layrnorm.bias": "model-00003-of-00003.safetensors", + "vision_tower.vision_tower.vision_model.pre_layrnorm.weight": "model-00003-of-00003.safetensors" + } +} \ No newline at end of file diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/e7dbc990f8ede75b1ad2fd17028fbd89a950286a b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/e7dbc990f8ede75b1ad2fd17028fbd89a950286a new file mode 100644 index 0000000000000000000000000000000000000000..e7dbc990f8ede75b1ad2fd17028fbd89a950286a --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/e7dbc990f8ede75b1ad2fd17028fbd89a950286a @@ -0,0 +1,44 @@ +{ + "_name_or_path": "google/flan-t5-xxl", + "architectures": [ + "CLIPT5ForConditionalGeneration" + ], + "d_ff": 10240, + "d_kv": 64, + "d_model": 4096, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "freeze_mm_mlp_adapter": false, + 
"image_aspect_ratio": "pad", + "image_grid_pinpoints": null, + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "mm_hidden_size": 1024, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "openai/clip-vit-large-patch14-336", + "model_type": "t5", + "num_decoder_layers": 24, + "num_heads": 64, + "num_layers": 24, + "output_past": true, + "pad_token_id": 0, + "prefix_mask": false, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.31.0", + "tune_mm_mlp_adapter": false, + "use_cache": true, + "use_mm_proj": true, + "vocab_size": 32128 +} diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/config.json b/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e7dbc990f8ede75b1ad2fd17028fbd89a950286a --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/config.json @@ -0,0 +1,44 @@ +{ + "_name_or_path": "google/flan-t5-xxl", + "architectures": [ + "CLIPT5ForConditionalGeneration" + ], + "d_ff": 10240, + "d_kv": 64, + "d_model": 4096, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "freeze_mm_mlp_adapter": false, + "image_aspect_ratio": "pad", + "image_grid_pinpoints": null, + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "mm_hidden_size": 1024, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "openai/clip-vit-large-patch14-336", + "model_type": "t5", + "num_decoder_layers": 24, + "num_heads": 64, + "num_layers": 24, + "output_past": true, + "pad_token_id": 0, + "prefix_mask": false, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.31.0", + "tune_mm_mlp_adapter": false, + "use_cache": true, + "use_mm_proj": true, + "vocab_size": 32128 +} diff --git a/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/generation_config.json b/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7528dbb1b6ce860d242aff71294a5fef12a41572 --- /dev/null +++ b/hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.31.0" +} diff --git a/inference_ytvos.py b/inference_ytvos.py new file mode 100644 index 0000000000000000000000000000000000000000..7c24df6901de051432b5c975d819b6abcdb53bd4 --- /dev/null +++ b/inference_ytvos.py @@ -0,0 +1,326 @@ +''' +Inference code for ReferFormer, on Ref-Youtube-VOS +Modified from DETR (https://github.com/facebookresearch/detr) 
+''' +import argparse +import json +import random +import time +from pathlib import Path + +import numpy as np +import torch + +import util.misc as utils +from models import build_model +import torchvision.transforms as T +import matplotlib.pyplot as plt +import os +import cv2 +from PIL import Image, ImageDraw +import math +import torch.nn.functional as F +import json + +import opts +from tqdm import tqdm + +import multiprocessing as mp +import threading + +from tools.colormap import colormap + + +# colormap +color_list = colormap() +color_list = color_list.astype('uint8').tolist() + +# build transform +transform = T.Compose([ + T.Resize(360), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) +]) + + +def main(args): + args.masks = True + args.batch_size = 1 + print("Inference only supports batch size = 1") + + # fix the seed for reproducibility + seed = args.seed + utils.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + split = args.split + # save path + output_dir = args.output_dir + save_path_prefix = os.path.join(output_dir, split) + if not os.path.exists(save_path_prefix): + os.makedirs(save_path_prefix) + + save_visualize_path_prefix = os.path.join(output_dir, split + '_images') + if args.visualize: + if not os.path.exists(save_visualize_path_prefix): + os.makedirs(save_visualize_path_prefix) + + # load data + root = Path(args.ytvos_path) # data/ref-youtube-vos + img_folder = os.path.join(root, split, "JPEGImages") + meta_file = os.path.join(root, "meta_expressions", split, "meta_expressions.json") + with open(meta_file, "r") as f: + data = json.load(f)["videos"] + valid_test_videos = set(data.keys()) + # for some reason the competition's validation expressions dict contains both the validation (202) & + # test videos (305).
so we simply load the test expressions dict and use it to filter out the test videos from + # the validation expressions dict: + test_meta_file = os.path.join(root, "meta_expressions", "test", "meta_expressions.json") + with open(test_meta_file, 'r') as f: + test_data = json.load(f)['videos'] + test_videos = set(test_data.keys()) + valid_videos = valid_test_videos - test_videos + video_list = sorted([video for video in valid_videos]) + assert len(video_list) == 202, 'error: incorrect number of validation videos' + + # create subprocess + thread_num = args.ngpu + global result_dict + result_dict = mp.Manager().dict() + + processes = [] + lock = threading.Lock() + + video_num = len(video_list) + per_thread_video_num = video_num // thread_num + + start_time = time.time() + print('Start inference') + for i in range(thread_num): + if i == thread_num - 1: + sub_video_list = video_list[i * per_thread_video_num:] + else: + sub_video_list = video_list[i * per_thread_video_num: (i + 1) * per_thread_video_num] + p = mp.Process(target=sub_processor, args=(lock, i, args, data, + save_path_prefix, save_visualize_path_prefix, + img_folder, sub_video_list)) + p.start() + processes.append(p) + + for p in processes: + p.join() + + end_time = time.time() + total_time = end_time - start_time + + result_dict = dict(result_dict) + num_all_frames_gpus = 0 + for pid, num_all_frames in result_dict.items(): + num_all_frames_gpus += num_all_frames + + print("Total inference time: %.4f s" %(total_time)) + +def sub_processor(lock, pid, args, data, save_path_prefix, save_visualize_path_prefix, img_folder, video_list): + text = 'processor %d' % pid + with lock: + progress = tqdm( + total=len(video_list), + position=pid, + desc=text, + ncols=0 + ) + torch.cuda.set_device(pid) + + # model + model, criterion, _ = build_model(args) + device = args.device + model.to(device) + + model_without_ddp = model + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + + if pid == 0: + print('number of params:', n_parameters) + + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False) + unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))] + if len(missing_keys) > 0: + print('Missing Keys: {}'.format(missing_keys)) + if len(unexpected_keys) > 0: + print('Unexpected Keys: {}'.format(unexpected_keys)) + else: + raise ValueError('Please specify the checkpoint for inference.') + + + # start inference + num_all_frames = 0 + model.eval() + + # 1. For each video + for video in video_list: + metas = [] # list[dict], length is number of expressions + + expressions = data[video]["expressions"] + expression_list = list(expressions.keys()) + num_expressions = len(expression_list) + video_len = len(data[video]["frames"]) + + # read all the anno meta + for i in range(num_expressions): + meta = {} + meta["video"] = video + meta["exp"] = expressions[expression_list[i]]["exp"] + meta["exp_id"] = expression_list[i] + meta["frames"] = data[video]["frames"] + metas.append(meta) + meta = metas + + # 2. 
For each expression + for i in range(num_expressions): + video_name = meta[i]["video"] + exp = meta[i]["exp"] + exp_id = meta[i]["exp_id"] + frames = meta[i]["frames"] + + video_len = len(frames) + # store images + imgs = [] + for t in range(video_len): + frame = frames[t] + img_path = os.path.join(img_folder, video_name, frame + ".jpg") + img = Image.open(img_path).convert('RGB') + origin_w, origin_h = img.size + imgs.append(transform(img)) # list[img] + + imgs = torch.stack(imgs, dim=0).to(args.device) # [video_len, 3, h, w] + img_h, img_w = imgs.shape[-2:] + size = torch.as_tensor([int(img_h), int(img_w)]).to(args.device) + target = {"size": size} + + with torch.no_grad(): + outputs = model([imgs], [exp], [target]) + + pred_logits = outputs["pred_logits"][0] + pred_boxes = outputs["pred_boxes"][0] + pred_masks = outputs["pred_masks"][0] + pred_ref_points = outputs["reference_points"][0] + + # according to pred_logits, select the query index + pred_scores = pred_logits.sigmoid() # [t, q, k] + pred_scores = pred_scores.mean(0) # [q, k] + max_scores, _ = pred_scores.max(-1) # [q,] + _, max_ind = max_scores.max(-1) # [1,] + max_inds = max_ind.repeat(video_len) + pred_masks = pred_masks[range(video_len), max_inds, ...] # [t, h, w] + pred_masks = pred_masks.unsqueeze(0) + + pred_masks = F.interpolate(pred_masks, size=(origin_h, origin_w), mode='bilinear', align_corners=False) + pred_masks = (pred_masks.sigmoid() > args.threshold).squeeze(0).detach().cpu().numpy() + + # store the video results + all_pred_logits = pred_logits[range(video_len), max_inds] + all_pred_boxes = pred_boxes[range(video_len), max_inds] + all_pred_ref_points = pred_ref_points[range(video_len), max_inds] + all_pred_masks = pred_masks + + if args.visualize: + for t, frame in enumerate(frames): + # original + img_path = os.path.join(img_folder, video_name, frame + '.jpg') + source_img = Image.open(img_path).convert('RGBA') # PIL image + + draw = ImageDraw.Draw(source_img) + draw_boxes = all_pred_boxes[t].unsqueeze(0) + draw_boxes = rescale_bboxes(draw_boxes.detach(), (origin_w, origin_h)).tolist() + + # draw boxes + xmin, ymin, xmax, ymax = draw_boxes[0] + draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=tuple(color_list[i%len(color_list)]), width=2) + + # draw reference point + ref_points = all_pred_ref_points[t].unsqueeze(0).detach().cpu().tolist() + draw_reference_points(draw, ref_points, source_img.size, color=color_list[i%len(color_list)]) + + # draw mask + source_img = vis_add_mask(source_img, all_pred_masks[t], color_list[i%len(color_list)]) + + # save + save_visualize_path_dir = os.path.join(save_visualize_path_prefix, video, str(i)) + if not os.path.exists(save_visualize_path_dir): + os.makedirs(save_visualize_path_dir) + save_visualize_path = os.path.join(save_visualize_path_dir, frame + '.png') + source_img.save(save_visualize_path) + + + # save binary image + save_path = os.path.join(save_path_prefix, video_name, exp_id) + if not os.path.exists(save_path): + os.makedirs(save_path) + for j in range(video_len): + frame_name = frames[j] + mask = all_pred_masks[j].astype(np.float32) + mask = Image.fromarray(mask * 255).convert('L') + save_file = os.path.join(save_path, frame_name + ".png") + mask.save(save_file) + + with lock: + progress.update(1) + result_dict[str(pid)] = num_all_frames + with lock: + progress.close() + + +# visuaize functions +def box_cxcywh_to_xyxy(x): + x_c, y_c, w, h = x.unbind(1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), + (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=1) + 
+def rescale_bboxes(out_bbox, size): + img_w, img_h = size + b = box_cxcywh_to_xyxy(out_bbox) + b = b.cpu() * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32) + return b + + +# Visualization functions +def draw_reference_points(draw, reference_points, img_size, color): + W, H = img_size + for i, ref_point in enumerate(reference_points): + init_x, init_y = ref_point + x, y = W * init_x, H * init_y + cur_color = color + draw.line((x-10, y, x+10, y), tuple(cur_color), width=4) + draw.line((x, y-10, x, y+10), tuple(cur_color), width=4) + +def draw_sample_points(draw, sample_points, img_size, color_list): + alpha = 255 + for i, samples in enumerate(sample_points): + for sample in samples: + x, y = sample + cur_color = color_list[i % len(color_list)][::-1] + cur_color += [alpha] + draw.ellipse((x-2, y-2, x+2, y+2), + fill=tuple(cur_color), outline=tuple(cur_color), width=1) + +def vis_add_mask(img, mask, color): + origin_img = np.asarray(img.convert('RGB')).copy() + color = np.array(color) + + mask = mask.reshape(mask.shape[0], mask.shape[1]).astype('uint8') # np + mask = mask > 0.5 + + origin_img[mask] = origin_img[mask] * 0.5 + color * 0.5 + origin_img = Image.fromarray(origin_img) + return origin_img + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('ReferFormer inference script', parents=[opts.get_args_parser()]) + args = parser.parse_args() + main(args) diff --git a/logs/gpt_ref-ytvos_numbered_cy_sanity.log b/logs/gpt_ref-ytvos_numbered_cy_sanity.log new file mode 100644 index 0000000000000000000000000000000000000000..08b51ea286b4289e93268a3b1d435e245c74af17 --- /dev/null +++ b/logs/gpt_ref-ytvos_numbered_cy_sanity.log @@ -0,0 +1,5967 @@ +skipped 57 short videos + + video num: 3471 clip num: 3414 + + +vid id: 003234408d + +-----------category name: penguin, frame name: 3 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: penguin, frame name: 12 +are penguins distinguished by action: YES + +-----------category name: penguin, frame name: 25 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: penguin, frame name: 32 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0043f083b5 + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +-----------category name: sedan, frame name: 10 +are sedans distinguished by action: NONE + +-----------category name: sedan, frame name: 14 +are sedans distinguished by action: "NONE" + +vid id: 0044fa5fba + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 005a527edd + +-----------category name: ape, frame name: 4 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... 
(2/3) +-----------category name: ape, frame name: 9 +are apes distinguished by action: YES + +-----------category name: ape, frame name: 15 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 24 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0065b171f9 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 00917dcfc4 + +-----------category name: zebra, frame name: 3 +are zebras distinguished by action: NONE + +-----------category name: zebra, frame name: 6 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: zebra, frame name: 12 +are zebras distinguished by action: YES + +-----------category name: zebra, frame name: 16 +are zebras distinguished by action: YES + +vid id: 00a23ccf53 + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +vid id: 00ad5016a4 + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +vid id: 01082ae388 + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +vid id: 011ac0a06f + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +vid id: 013099c098 + +-----------category name: giant_panda, frame name: 2 +are giant_pandas distinguished by action: YES + +-----------category name: giant_panda, frame name: 7 +are giant_pandas distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: giant_panda, frame name: 10 +are giant_pandas distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: giant_panda, frame name: 11 +are giant_pandas distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0155498c85 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +vid id: 01694ad9c8 + +Skipping bird: There is single or no object. + +Skipping bird: There is single or no object. + +Skipping bird: There is single or no object. 
+ +Skipping bird: There is single or no object. + +vid id: 017ac35701 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 01b80e8e1a + +-----------category name: zebra, frame name: 2 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: zebra, frame name: 5 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: zebra, frame name: 7 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: zebra, frame name: 9 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 01baa5a4e1 + +Skipping frisbee: Determined to be non-movable. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +vid id: 01c3111683 + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +vid id: 01c4cb5ffe + +Skipping person: There is single or no object. + +-----------category name: person, frame name: 15 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 22 +are persons distinguished by action: NONE + +Skipping person: There is single or no object. + +Skipping snowboard: Determined to be non-movable. + +Skipping snowboard: There is single or no object. + +Skipping snowboard: There is single or no object. + +Skipping snowboard: There is single or no object. + +Skipping snowboard: There is single or no object. + +vid id: 01c76f0a82 + +Skipping plant: Determined to be non-movable. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +-----------category name: sedan, frame name: 12 +are sedans distinguished by action: NONE + +-----------category name: sedan, frame name: 14 +are sedans distinguished by action: I'm unable to determine any actions or postures of sedans from images, as vehicles don't perform actions like people. + +vid id: 01c783268c + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +vid id: 01e64dd36a + +-----------category name: cow, frame name: 3 +are cows distinguished by action: NONE + +-----------category name: cow, frame name: 5 +are cows distinguished by action: YES + +-----------category name: cow, frame name: 10 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... 
(2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cow, frame name: 14 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 01ed275c6e + +-----------category name: giraffe, frame name: 4 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: giraffe, frame name: 8 +are giraffes distinguished by action: YES + +-----------category name: giraffe, frame name: 12 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: giraffe, frame name: 16 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 01ff60d1fa + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 020cd28cd2 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 02264db755 + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +vid id: 0248626d9a + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +vid id: 02668dbffa + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +vid id: 0274193026 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 02d28375aa + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +Skipping fox: There is single or no object. + +vid id: 031ccc99b1 + +-----------category name: person, frame name: 4 +are persons distinguished by action: YES + +-----------category name: person, frame name: 5 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Skipping person: There is single or no object. + +-----------category name: person, frame name: 14 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +vid id: 0321b18c10 + +Skipping elephant: There is single or no object. + +Skipping elephant: There is single or no object. + +Skipping elephant: There is single or no object. + +Skipping elephant: There is single or no object. 
+ +-----------category name: person, frame name: 3 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 7 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 8 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 13 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0348a45bca + +-----------category name: fish, frame name: 8 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 16 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 19 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 27 +are fishs distinguished by action: NONE + +vid id: 0355e92655 + +Skipping boat: There is single or no object. + +Skipping boat: There is single or no object. + +Skipping boat: There is single or no object. + +Skipping boat: There is single or no object. + +Skipping paddle: Determined to be non-movable. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 0358b938c1 + +Skipping elephant: There is single or no object. + +-----------category name: elephant, frame name: 7 +are elephants distinguished by action: YES + +-----------category name: elephant, frame name: 9 +are elephants distinguished by action: YES + +-----------category name: elephant, frame name: 16 +are elephants distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0368107cf1 + +-----------category name: person, frame name: 2 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 6 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 10 +are persons distinguished by action: I'm sorry, I cannot identify or analyze individuals in the image provided. + +-----------category name: person, frame name: 15 +are persons distinguished by action: NONE + +vid id: 0379ddf557 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 038b2cc71d + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 038c15a5dd + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. 
+ +vid id: 03a06cc98a + +-----------category name: giraffe, frame name: 5 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: giraffe, frame name: 8 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: giraffe, frame name: 12 +are giraffes distinguished by action: YES + +-----------category name: giraffe, frame name: 14 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +vid id: 03a63e187f + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 03c95b4dae + +-----------category name: elephant, frame name: 3 +are elephants distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: elephant, frame name: 5 +are elephants distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: elephant, frame name: 10 +are elephants distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: elephant, frame name: 16 +are elephants distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 03e2b57b0e + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 04194e1248 + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 04259896e2 + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 0444918a5f + +-----------category name: truck, frame name: 2 +are trucks distinguished by action: NONE + +-----------category name: truck, frame name: 9 +are trucks distinguished by action: NONE + +-----------category name: truck, frame name: 13 +are trucks distinguished by action: NONE + +-----------category name: truck, frame name: 16 +are trucks distinguished by action: NONE + +vid id: 04460a7a52 + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 04474174a4 + +-----------category name: ape, frame name: 4 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: ape, frame name: 12 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: ape, frame name: 22 +are apes distinguished by action: NONE + +-----------category name: ape, frame name: 31 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0450095513 + +Skipping snail: There is single or no object. + +Skipping snail: There is single or no object. + +Skipping snail: There is single or no object. + +Skipping snail: There is single or no object. + +vid id: 045f00aed2 + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +Skipping others: Determined to be non-movable. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 04667fabaa + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +vid id: 04735c5030 + +-----------category name: cat, frame name: 3 +are cats distinguished by action: YES + +-----------category name: cat, frame name: 6 +are cats distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cat, frame name: 10 +are cats distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: cat, frame name: 15 +are cats distinguished by action: YES + +Retrying caption generation... (1/3) +vid id: 04990d1915 + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +vid id: 04d62d9d98 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 04f21da964 + +Skipping monkey: There is single or no object. + +Skipping monkey: There is single or no object. + +Skipping monkey: There is single or no object. + +Skipping monkey: There is single or no object. + +vid id: 04fbad476e + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. 
+ +Skipping parrot: There is single or no object. + +vid id: 04fe256562 + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping truck: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +vid id: 0503bf89c9 + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +vid id: 0536c9eed0 + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +vid id: 054acb238f + +Skipping owl: There is single or no object. + +Skipping owl: There is single or no object. + +Skipping owl: There is single or no object. + +Skipping owl: There is single or no object. + +vid id: 05579ca250 + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 056c200404 + +Skipping toilet: Determined to be non-movable. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +vid id: 05774f3a2c + +-----------category name: ape, frame name: 4 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 13 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 25 +are apes distinguished by action: NONE + +-----------category name: ape, frame name: 33 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 058a7592c8 + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. 
+ +vid id: 05a0a513df + +-----------category name: person, frame name: 4 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 9 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 12 +are persons distinguished by action: "NONE" + +-----------category name: person, frame name: 15 +are persons distinguished by action: NONE + +vid id: 05a569d8aa + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +Skipping cat: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +vid id: 05aa652648 + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +vid id: 05d7715782 + +Skipping sign: Determined to be non-movable. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +vid id: 05e0b0f28f + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +vid id: 05fdbbdd7a + +Skipping umbrella: Determined to be non-movable. + +vid id: 05ffcfed85 + +Skipping monkey: There is single or no object. + +-----------category name: monkey, frame name: 15 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: monkey, frame name: 22 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Skipping monkey: There is single or no object. + +vid id: 0630391881 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping tennis_racket: Determined to be non-movable. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +vid id: 06840b2bbe + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +vid id: 068f7dce6f + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +Skipping shark: There is single or no object. + +vid id: 0693719753 + +-----------category name: turtle, frame name: 7 +are turtles distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: turtle, frame name: 12 +are turtles distinguished by action: NONE + +-----------category name: turtle, frame name: 15 +are turtles distinguished by action: "NONE" + +-----------category name: turtle, frame name: 20 +are turtles distinguished by action: NONE + +vid id: 06ce2b51fb + +Skipping paddle: Determined to be non-movable. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +Skipping paddle: There is single or no object. + +-----------category name: person, frame name: 2 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: person, frame name: 7 +are persons distinguished by action: YES + +-----------category name: person, frame name: 9 +are persons distinguished by action: "NONE" + +-----------category name: person, frame name: 12 +are persons distinguished by action: NONE + +vid id: 06e224798e + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +Skipping tiger: There is single or no object. + +vid id: 06ee361788 + +-----------category name: duck, frame name: 3 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 6 +are ducks distinguished by action: YES + +-----------category name: duck, frame name: 10 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 14 +are ducks distinguished by action: NONE + +vid id: 06fbb3fa2c + +Skipping eagle: There is single or no object. + +Skipping eagle: There is single or no object. + +Skipping eagle: There is single or no object. + +Skipping eagle: There is single or no object. + +vid id: 0700264286 + +-----------category name: cow, frame name: 4 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: cow, frame name: 6 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cow, frame name: 8 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: cow, frame name: 12 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 070c918ca7 + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +vid id: 07129e14a4 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +-----------category name: parrot, frame name: 2 +are parrots distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: parrot, frame name: 10 +are parrots distinguished by action: YES + +-----------category name: parrot, frame name: 22 +are parrots distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: parrot, frame name: 30 +are parrots distinguished by action: NONE + +vid id: 07177017e9 + +-----------category name: motorbike, frame name: 4 +are motorbikes distinguished by action: "NONE" + +-----------category name: motorbike, frame name: 6 +are motorbikes distinguished by action: NONE + +-----------category name: motorbike, frame name: 9 +are motorbikes distinguished by action: "NONE" + +-----------category name: motorbike, frame name: 13 +are motorbikes distinguished by action: "NONE" + +vid id: 07238ffc58 + +-----------category name: monkey, frame name: 6 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: monkey, frame name: 14 +are monkeys distinguished by action: YES + +-----------category name: monkey, frame name: 25 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: monkey, frame name: 28 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 07353b2a89 + +-----------category name: sheep, frame name: 6 +are sheeps distinguished by action: NONE + +-----------category name: sheep, frame name: 8 +are sheeps distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: sheep, frame name: 17 +are sheeps distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: sheep, frame name: 25 +are sheeps distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0738493cbf + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +vid id: 075926c651 + +-----------category name: person, frame name: 2 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 075c701292 + +-----------category name: duck, frame name: 8 +are ducks distinguished by action: YES + +Retrying caption generation... 
(1/3) +-----------category name: duck, frame name: 14 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 18 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 33 +are ducks distinguished by action: NONE + +vid id: 0762ea9a30 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 07652ee4af + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 076f206928 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +-----------category name: zebra, frame name: 3 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: zebra, frame name: 9 +are zebras distinguished by action: "NONE" + +-----------category name: zebra, frame name: 10 +are zebras distinguished by action: "NONE" + +-----------category name: zebra, frame name: 16 +are zebras distinguished by action: "NONE" + +vid id: 077d32af19 + +-----------category name: person, frame name: 5 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 9 +are persons distinguished by action: "NONE" + +-----------category name: person, frame name: 10 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 14 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +vid id: 079049275c + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +vid id: 07913cdda7 + +-----------category name: person, frame name: 4 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: person, frame name: 7 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 9 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 11 +are persons distinguished by action: NONE + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. 
+ +vid id: 07a11a35e8 + +-----------category name: ape, frame name: 6 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 15 +are apes distinguished by action: YES + +-----------category name: ape, frame name: 21 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: ape, frame name: 29 +are apes distinguished by action: YES + +vid id: 07ac33b6df + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +vid id: 07c62c3d11 + +-----------category name: parrot, frame name: 8 +are parrots distinguished by action: "NONE" + +-----------category name: parrot, frame name: 17 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: parrot, frame name: 18 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: parrot, frame name: 30 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 07cc1c7d74 + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +vid id: 080196ef01 + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping hand: Determined to be non-movable. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +vid id: 081207976e + +Skipping hat: Determined to be non-movable. + +vid id: 081ae4fa44 + +-----------category name: shark, frame name: 2 +are sharks distinguished by action: NONE + +-----------category name: shark, frame name: 13 +are sharks distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: shark, frame name: 14 +are sharks distinguished by action: YES + +-----------category name: shark, frame name: 22 +are sharks distinguished by action: "NONE" + +vid id: 081d8250cb + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. 
+ +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 082900c5d4 + +-----------category name: duck, frame name: 4 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 7 +are ducks distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: duck, frame name: 10 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 17 +are ducks distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 0860df21e2 + +Skipping frisbee: Determined to be non-movable. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +vid id: 0866d4c5e3 + +-----------category name: bird, frame name: 2 +are birds distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: bird, frame name: 6 +are birds distinguished by action: NONE + +-----------category name: bird, frame name: 8 +are birds distinguished by action: NONE + +-----------category name: bird, frame name: 11 +are birds distinguished by action: "NONE" + +vid id: 0891ac2eb6 + +-----------category name: person, frame name: 3 +are persons distinguished by action: YES + +-----------category name: person, frame name: 6 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: person, frame name: 10 +are persons distinguished by action: YES + +-----------category name: person, frame name: 12 +are persons distinguished by action: YES + +vid id: 08931bc458 + +Skipping others: Determined to be non-movable. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping others: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 08aa2705d5 + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +vid id: 08c8450db7 + +Skipping toilet: Determined to be non-movable. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +Skipping toilet: There is single or no object. + +vid id: 08d50b926c + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. 
+ +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +vid id: 08e1e4de15 + +-----------category name: monkey, frame name: 2 +are monkeys distinguished by action: YES + +-----------category name: monkey, frame name: 10 +are monkeys distinguished by action: YES + +-----------category name: monkey, frame name: 22 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: monkey, frame name: 32 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 08e48c1a48 + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +vid id: 08f561c65e + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 08feb87790 + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +vid id: 09049f6fe3 + +-----------category name: mouse, frame name: 4 +are mouses distinguished by action: YES + +-----------category name: mouse, frame name: 11 +are mouses distinguished by action: "NONE" + +-----------category name: mouse, frame name: 16 +are mouses distinguished by action: "NONE" + +-----------category name: mouse, frame name: 27 +are mouses distinguished by action: NONE + +vid id: 092e4ff450 + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +Skipping snake: There is single or no object. + +vid id: 09338adea8 + +-----------category name: whale, frame name: 7 +are whales distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: whale, frame name: 11 +are whales distinguished by action: YES + +-----------category name: whale, frame name: 19 +are whales distinguished by action: NONE + +-----------category name: whale, frame name: 29 +are whales distinguished by action: "NONE" + +vid id: 093c335ccc + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping frisbee: Determined to be non-movable. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +vid id: 0970d28339 + +-----------category name: ape, frame name: 8 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: ape, frame name: 17 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 22 +are apes distinguished by action: YES + +-----------category name: ape, frame name: 33 +are apes distinguished by action: YES + +vid id: 0974a213dc + +-----------category name: giraffe, frame name: 5 +are giraffes distinguished by action: YES + +-----------category name: giraffe, frame name: 7 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: giraffe, frame name: 10 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: giraffe, frame name: 17 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 097b471ed8 + +-----------category name: cat, frame name: 2 +are cats distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: cat, frame name: 7 +are cats distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: cat, frame name: 12 +are cats distinguished by action: "NONE" + +-----------category name: cat, frame name: 14 +are cats distinguished by action: NONE + +vid id: 0990941758 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 09a348f4fa + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 09a6841288 + +-----------category name: duck, frame name: 4 +are ducks distinguished by action: "NONE" + +-----------category name: duck, frame name: 7 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 13 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 16 +are ducks distinguished by action: "NONE" + +vid id: 09c5bad17b + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +vid id: 09c9ce80c7 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 09ff54fef4 + +-----------category name: fox, frame name: 5 +are foxs distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: fox, frame name: 10 +are foxs distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. 
+[... caption-generation run log continues for vid ids 0a23765d15 through 185bf64702: for each video and category the script records a per-frame judgment "are <category>s distinguished by action: YES / NONE", skips categories with "Skipping <category>: There is single or no object." or "Skipping <category>: Determined to be non-movable.", and on failed generations retries up to three times ("Retrying caption generation... (1/3)" ... "(3/3)") before logging "Max retries reached. Caption generation failed." ...]
(3/3) +Max retries reached. Caption generation failed. +Skipping zebra: There is single or no object. + +-----------category name: zebra, frame name: 15 +are zebras distinguished by action: YES + +vid id: 18913cc690 + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +vid id: 1892651815 + +Skipping camel: There is single or no object. + +Skipping camel: There is single or no object. + +Skipping camel: There is single or no object. + +Skipping camel: There is single or no object. + +vid id: 189ac8208a + +-----------category name: giraffe, frame name: 2 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: giraffe, frame name: 6 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: giraffe, frame name: 8 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: giraffe, frame name: 11 +are giraffes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 189b44e92c + +Skipping zebra: There is single or no object. + +Skipping zebra: There is single or no object. + +Skipping zebra: There is single or no object. + +Skipping zebra: There is single or no object. + +vid id: 18ac264b76 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 18b245ab49 + +-----------category name: penguin, frame name: 4 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: penguin, frame name: 5 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: penguin, frame name: 10 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: penguin, frame name: 13 +are penguins distinguished by action: YES + +Retrying caption generation... (1/3) +vid id: 18b5cebc34 + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +Skipping mouse: There is single or no object. + +vid id: 18bad52083 + +-----------category name: parrot, frame name: 2 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: parrot, frame name: 11 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: parrot, frame name: 18 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: parrot, frame name: 31 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 18bb5144d5 + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 18c6f205c5 + +-----------category name: person, frame name: 4 +are persons distinguished by action: YES + +-----------category name: person, frame name: 6 +are persons distinguished by action: YES + +-----------category name: person, frame name: 10 +are persons distinguished by action: YES + +-----------category name: person, frame name: 14 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +vid id: 1903f9ea15 + +-----------category name: bird, frame name: 4 +are birds distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: bird, frame name: 6 +are birds distinguished by action: NONE + +-----------category name: bird, frame name: 10 +are birds distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: bird, frame name: 14 +are birds distinguished by action: NONE + +vid id: 1917b209f2 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +-----------category name: cow, frame name: 4 +are cows distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cow, frame name: 7 +are cows distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cow, frame name: 8 +are cows distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: cow, frame name: 16 +are cows distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +Skipping horse: There is single or no object. + +Skipping horse: There is single or no object. + +Skipping horse: There is single or no object. + +Skipping horse: There is single or no object. + +vid id: 191e74c01d + +Skipping deer: There is single or no object. + +Skipping deer: There is single or no object. + +Skipping deer: There is single or no object. 
+ +Skipping deer: There is single or no object. + +vid id: 19367bb94e + +-----------category name: fish, frame name: 9 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 17 +are fishs distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: fish, frame name: 24 +are fishs distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: fish, frame name: 26 +are fishs distinguished by action: NONE + +vid id: 193ffaa217 + +-----------category name: person, frame name: 2 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 7 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 8 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 13 +are persons distinguished by action: NONE + +vid id: 19696b67d3 + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +vid id: 197f3ab6f3 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 1981e763cc + +-----------category name: sheep, frame name: 2 +are sheeps distinguished by action: YES + +-----------category name: sheep, frame name: 17 +are sheeps distinguished by action: NONE + +-----------category name: sheep, frame name: 20 +are sheeps distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: sheep, frame name: 29 +are sheeps distinguished by action: YES + +Retrying caption generation... (1/3) +vid id: 198afe39ae + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping surfboard: Determined to be non-movable. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +vid id: 19a6e62b9b + +Skipping monkey: There is single or no object. + +Skipping monkey: There is single or no object. + +Skipping monkey: There is single or no object. + +-----------category name: monkey, frame name: 24 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 19b60d5335 + +Skipping hand: Determined to be non-movable. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +Skipping hedgehog: There is single or no object. + +vid id: 19c00c11f9 + +Skipping person: There is single or no object. 
+ +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping surfboard: Determined to be non-movable. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +vid id: 19e061eb88 + +-----------category name: boat, frame name: 4 +are boats distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: boat, frame name: 5 +are boats distinguished by action: NONE + +-----------category name: boat, frame name: 10 +are boats distinguished by action: NONE + +-----------category name: boat, frame name: 15 +are boats distinguished by action: NONE + +vid id: 19e8bc6178 + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +vid id: 19ee80dac6 + +-----------category name: person, frame name: 3 +are persons distinguished by action: YES + +-----------category name: person, frame name: 17 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping surfboard: Determined to be non-movable. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +Skipping surfboard: There is single or no object. + +vid id: 1a25a9170a + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +Skipping cow: There is single or no object. + +-----------category name: person, frame name: 6 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 16 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: person, frame name: 23 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 26 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1a359a6c1a + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +Skipping sheep: There is single or no object. + +vid id: 1a3e87c566 + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +Skipping frog: There is single or no object. + +vid id: 1a5fe06b00 + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +Skipping bus: There is single or no object. + +vid id: 1a6c0fbd1e + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. 
+ +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 1a6f3b5a4b + +Skipping bike: Determined to be non-movable. + +Skipping bike: There is single or no object. + +Skipping bike: There is single or no object. + +Skipping bike: There is single or no object. + +Skipping bike: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping sedan: There is single or no object. + +Skipping hand: Determined to be non-movable. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +vid id: 1a8afbad92 + +-----------category name: zebra, frame name: 3 +are zebras distinguished by action: "NONE" + +-----------category name: zebra, frame name: 5 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: zebra, frame name: 10 +are zebras distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Skipping zebra: There is single or no object. + +vid id: 1a8bdc5842 + +-----------category name: parrot, frame name: 3 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: parrot, frame name: 11 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: parrot, frame name: 14 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: parrot, frame name: 23 +are parrots distinguished by action: YES + +Retrying caption generation... (1/3) +vid id: 1a95752aca + +-----------category name: duck, frame name: 4 +are ducks distinguished by action: NONE + +-----------category name: duck, frame name: 10 +are ducks distinguished by action: "NONE" + +-----------category name: duck, frame name: 14 +are ducks distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: duck, frame name: 27 +are ducks distinguished by action: "NONE" + +vid id: 1a9c131cb7 + +-----------category name: ape, frame name: 6 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 17 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: ape, frame name: 20 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... 
(2/3) +-----------category name: ape, frame name: 27 +are apes distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1aa3da3ee3 + +-----------category name: sheep, frame name: 2 +are sheeps distinguished by action: "NONE" + +-----------category name: sheep, frame name: 9 +are sheeps distinguished by action: "NONE" + +-----------category name: sheep, frame name: 15 +are sheeps distinguished by action: "NONE" + +-----------category name: sheep, frame name: 25 +are sheeps distinguished by action: "NONE" + +vid id: 1ab27ec7ea + +Skipping deer: There is single or no object. + +Skipping deer: There is single or no object. + +Skipping deer: There is single or no object. + +Skipping deer: There is single or no object. + +vid id: 1abf16d21d + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +vid id: 1acd0f993b + +Skipping frisbee: Determined to be non-movable. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping frisbee: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping dog: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +vid id: 1ad202e499 + +-----------category name: lizard, frame name: 6 +are lizards distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: lizard, frame name: 14 +are lizards distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: lizard, frame name: 22 +are lizards distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: lizard, frame name: 31 +are lizards distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1af8d2395d + +Skipping parachute: Determined to be non-movable. + +Skipping parachute: There is single or no object. + +Skipping parachute: There is single or no object. + +Skipping parachute: There is single or no object. + +Skipping parachute: There is single or no object. + +-----------category name: person, frame name: 6 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 13 +are persons distinguished by action: YES + +-----------category name: person, frame name: 20 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
+-----------category name: person, frame name: 28 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +vid id: 1afd39a1fa + +Skipping hand: Determined to be non-movable. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping hand: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +Skipping motorbike: There is single or no object. + +vid id: 1b2d31306f + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +Skipping lizard: There is single or no object. + +vid id: 1b3fa67f0e + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +Skipping airplane: There is single or no object. + +vid id: 1b43fa74b4 + +-----------category name: owl, frame name: 7 +are owls distinguished by action: NONE + +-----------category name: owl, frame name: 12 +are owls distinguished by action: "NONE" + +-----------category name: owl, frame name: 19 +are owls distinguished by action: NONE + +-----------category name: owl, frame name: 20 +are owls distinguished by action: NONE + +vid id: 1b73ea9fc2 + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +Skipping parrot: There is single or no object. + +vid id: 1b7e8bb255 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping hat: Determined to be non-movable. + +Skipping hat: There is single or no object. + +Skipping hat: There is single or no object. + +Skipping hat: There is single or no object. + +Skipping hat: There is single or no object. + +vid id: 1b8680f8cd + +Skipping tennis_racket: Determined to be non-movable. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +-----------category name: person, frame name: 6 +are persons distinguished by action: YES + +-----------category name: person, frame name: 9 +are persons distinguished by action: YES + +-----------category name: person, frame name: 14 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: person, frame name: 21 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... 
(2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1b883843c0 + +-----------category name: person, frame name: 4 +are persons distinguished by action: NONE + +-----------category name: person, frame name: 7 +are persons distinguished by action: YES + +-----------category name: person, frame name: 9 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 14 +are persons distinguished by action: "NONE" + +vid id: 1b8898785b + +-----------category name: monkey, frame name: 9 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: monkey, frame name: 14 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: monkey, frame name: 22 +are monkeys distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Skipping monkey: There is single or no object. + +vid id: 1b88ba1aa4 + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +Skipping giant_panda: There is single or no object. + +vid id: 1b96a498e5 + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +Skipping ape: There is single or no object. + +vid id: 1bbc4c274f + +Skipping fish: There is single or no object. + +Skipping fish: There is single or no object. + +Skipping fish: There is single or no object. + +Skipping fish: There is single or no object. + +vid id: 1bd87fe9ab + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +vid id: 1c4090c75b + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +Skipping whale: There is single or no object. + +vid id: 1c41934f84 + +-----------category name: elephant, frame name: 5 +are elephants distinguished by action: YES + +-----------category name: elephant, frame name: 6 +are elephants distinguished by action: "NONE" + +-----------category name: elephant, frame name: 13 +are elephants distinguished by action: NONE + +-----------category name: elephant, frame name: 16 +are elephants distinguished by action: NONE + +vid id: 1c72b04b56 + +Skipping lion: There is single or no object. + +Skipping lion: There is single or no object. + +Skipping lion: There is single or no object. + +Skipping lion: There is single or no object. + +vid id: 1c87955a3a + +Skipping crocodile: There is single or no object. + +Skipping crocodile: There is single or no object. + +Skipping crocodile: There is single or no object. + +Skipping crocodile: There is single or no object. + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +Skipping turtle: There is single or no object. + +vid id: 1c9f9eb792 + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. + +Skipping person: There is single or no object. 
+ +Skipping skateboard: Determined to be non-movable. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +Skipping skateboard: There is single or no object. + +vid id: 1ca240fede + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +Skipping train: There is single or no object. + +vid id: 1ca5673803 + +-----------category name: person, frame name: 8 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 14 +are persons distinguished by action: YES + +-----------category name: person, frame name: 21 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: person, frame name: 27 +are persons distinguished by action: YES + +Skipping tennis_racket: Determined to be non-movable. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +Skipping tennis_racket: There is single or no object. + +vid id: 1cada35274 + +Skipping duck: There is single or no object. + +Skipping duck: There is single or no object. + +Skipping duck: There is single or no object. + +Skipping duck: There is single or no object. + +vid id: 1cb44b920d + +-----------category name: eagle, frame name: 5 +are eagles distinguished by action: YES + +Retrying caption generation... (1/3) +-----------category name: eagle, frame name: 13 +are eagles distinguished by action: YES + +-----------category name: eagle, frame name: 18 +are eagles distinguished by action: YES + +-----------category name: eagle, frame name: 27 +are eagles distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1cd10e62be + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +Skipping leopard: There is single or no object. + +vid id: 1d3087d5e5 + +-----------category name: fish, frame name: 5 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 11 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 23 +are fishs distinguished by action: NONE + +-----------category name: fish, frame name: 33 +are fishs distinguished by action: NONE + +vid id: 1d3685150a + +Skipping sign: Determined to be non-movable. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +Skipping sign: There is single or no object. + +-----------category name: person, frame name: 2 +are persons distinguished by action: YES + +-----------category name: person, frame name: 6 +are persons distinguished by action: "YES" + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: person, frame name: 8 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. 
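The retry behaviour visible throughout this log (up to three further attempts, then a hard failure) suggests a simple capped-retry loop. A minimal, hypothetical sketch of that pattern; generate_caption and caption_is_valid are placeholder names, since the generation script itself is not shown in this diff:

def caption_with_retries(frame, category, generate_caption, caption_is_valid, max_retries=3):
    # One initial attempt plus up to max_retries further attempts, echoing the log messages above.
    for attempt in range(max_retries + 1):
        caption = generate_caption(frame, category)
        if caption_is_valid(caption):
            return caption
        if attempt < max_retries:
            print(f"Retrying caption generation... ({attempt + 1}/{max_retries})")
    print("Max retries reached. Caption generation failed.")
    return None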
+-----------category name: person, frame name: 15 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +vid id: 1d6ff083aa + +-----------category name: person, frame name: 2 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +-----------category name: person, frame name: 9 +are persons distinguished by action: YES + +-----------category name: person, frame name: 10 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +-----------category name: person, frame name: 17 +are persons distinguished by action: YES + +Retrying caption generation... (1/3) +Retrying caption generation... (2/3) +Retrying caption generation... (3/3) +Max retries reached. Caption generation failed. +Finished! diff --git a/models/ops/functions/__init__.py b/models/ops/functions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8a2197bda3199aa32cafc5b9d396479609853dd2 --- /dev/null +++ b/models/ops/functions/__init__.py @@ -0,0 +1,10 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +from .ms_deform_attn_func import MSDeformAttnFunction + diff --git a/models/ops/setup.py b/models/ops/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..a0131bc21cf1b45b90fcf174e2c53e4c08e9c641 --- /dev/null +++ b/models/ops/setup.py @@ -0,0 +1,71 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +import os +import glob + +import torch + +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +from setuptools import find_packages +from setuptools import setup + +requirements = ["torch", "torchvision"] + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "src") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + sources = main_file + source_cpu + extension = CppExtension + extra_compile_args = {"cxx": []} + define_macros = [] + + if torch.cuda.is_available() and CUDA_HOME is not None: + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + else: + raise NotImplementedError('CUDA is not available') + + sources = [os.path.join(extensions_dir, s) for s in sources] + include_dirs = [extensions_dir] + ext_modules = [ + extension( + "MultiScaleDeformableAttention", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + return ext_modules + +setup( + name="MultiScaleDeformableAttention", + version="1.0", + author="Weijie Su", + url="https://github.com/fundamentalvision/Deformable-DETR", + description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", + packages=find_packages(exclude=("configs", "tests",)), + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/models/ops/src/cpu/ms_deform_attn_cpu.h b/models/ops/src/cpu/ms_deform_attn_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..81b7b58a3d9502bbb684dc84687a526dedf94cae --- /dev/null +++ b/models/ops/src/cpu/ms_deform_attn_cpu.h @@ -0,0 +1,33 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved.
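A usage note on the build script above: the extension is typically compiled once per environment before training, and the result can be smoke-tested with a short import check. A minimal sketch, assuming a CUDA-capable machine and the models/ops layout shown in this diff:

# Compile and install the extension declared in setup.py (run from models/ops):
#   python setup.py build install
# Then confirm that the compiled op and its autograd wrapper both import cleanly:
import torch
import MultiScaleDeformableAttention                             # name from the ext_modules entry above
from functions.ms_deform_attn_func import MSDeformAttnFunction   # wrapper exercised by test.py below

assert torch.cuda.is_available(), "setup.py above only builds the CUDA path"
print("compiled op loaded from:", MultiScaleDeformableAttention.__file__)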
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once +#include + +at::Tensor +ms_deform_attn_cpu_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +std::vector +ms_deform_attn_cpu_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step); + + diff --git a/models/ops/test.py b/models/ops/test.py new file mode 100644 index 0000000000000000000000000000000000000000..8dbf6d5547d131f01a8c5c28b76557bd27a9334b --- /dev/null +++ b/models/ops/test.py @@ -0,0 +1,89 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import time +import torch +import torch.nn as nn +from torch.autograd import gradcheck + +from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch + + +N, M, D = 1, 2, 2 +Lq, L, P = 2, 2, 2 +shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() +level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) +S = sum([(H*W).item() for H, W in shapes]) + + +torch.manual_seed(3) + + +@torch.no_grad() +def check_forward_equal_with_pytorch_double(): + value = torch.rand(N, S, M, D).cuda() * 0.01 + sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() + attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) + im2col_step = 2 + output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() + output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() + fwdok = torch.allclose(output_cuda, output_pytorch) + max_abs_err = (output_cuda - output_pytorch).abs().max() + max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() + + print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + +@torch.no_grad() +def check_forward_equal_with_pytorch_float(): + value = torch.rand(N, S, M, D).cuda() * 0.01 + sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() + attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, 
keepdim=True) + im2col_step = 2 + output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() + output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() + fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) + max_abs_err = (output_cuda - output_pytorch).abs().max() + max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() + + print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + +def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): + + value = torch.rand(N, S, M, channels).cuda() * 0.01 + sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() + attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) + im2col_step = 2 + func = MSDeformAttnFunction.apply + + value.requires_grad = grad_value + sampling_locations.requires_grad = grad_sampling_loc + attention_weights.requires_grad = grad_attn_weight + + gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) + + print(f'* {gradok} check_gradient_numerical(D={channels})') + + +if __name__ == '__main__': + check_forward_equal_with_pytorch_double() + check_forward_equal_with_pytorch_float() + + for channels in [30, 32, 64, 71, 1025, 2048, 3096]: + check_gradient_numerical(channels, True, True, True) + + + diff --git a/models/referformer.py b/models/referformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f57096b288ece0f53afc7d31dc5dc465cdebfb --- /dev/null +++ b/models/referformer.py @@ -0,0 +1,639 @@ +""" +ReferFormer model class. +Modified from DETR (https://github.com/facebookresearch/detr) +""" +import torch +import torch.nn.functional as F +from torch import nn + +import os +import math +from util import box_ops +from util.misc import (NestedTensor, nested_tensor_from_tensor_list, + nested_tensor_from_videos_list, + accuracy, get_world_size, interpolate, + is_dist_avail_and_initialized, inverse_sigmoid) + +from .position_encoding import PositionEmbeddingSine1D +from .backbone import build_backbone +from .deformable_transformer import build_deforamble_transformer +from .segmentation import CrossModalFPNDecoder, VisionLanguageFusionModule +from .matcher import build_matcher +from .criterion import SetCriterion +from .postprocessors import build_postprocessors + +from transformers import BertTokenizer, BertModel, RobertaModel, RobertaTokenizerFast + +import copy +from einops import rearrange, repeat + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +os.environ["TOKENIZERS_PARALLELISM"] = "false" # this disables a huggingface tokenizer warning (printed every epoch) + +class ReferFormer(nn.Module): + """ This is the ReferFormer module that performs referring video object detection """ + def __init__(self, backbone, transformer, num_classes, num_queries, num_feature_levels, + num_frames, mask_dim, dim_feedforward, + controller_layers, dynamic_mask_channels, + aux_loss=False, with_box_refine=False, two_stage=False, + freeze_text_encoder=False, rel_coord=True): + """ Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. 
See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_classes: number of object classes + num_queries: number of object queries, i.e., detection slots. This is the maximal number of objects + ReferFormer can detect in a video. For ytvos, we recommend 5 queries for each frame. + num_frames: number of clip frames + mask_dim: dynamic conv intermediate layer channel number. + dim_feedforward: vision-language fusion module ffn channel number. + dynamic_mask_channels: the mask feature output channel number. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + """ + super().__init__() + self.num_queries = num_queries + self.transformer = transformer + hidden_dim = transformer.d_model + self.hidden_dim = hidden_dim + self.class_embed = nn.Linear(hidden_dim, num_classes) + self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + self.num_feature_levels = num_feature_levels + + # Build Transformer + # NOTE: different from deformable detr, the query_embed out channels is + # hidden_dim instead of hidden_dim * 2 + # This is because the input to the decoder is the text embedding feature + self.query_embed = nn.Embedding(num_queries, hidden_dim) + + # follow deformable-detr, we use the last three stages of backbone + if num_feature_levels > 1: + num_backbone_outs = len(backbone.strides[-3:]) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.num_channels[-3:][_] + input_proj_list.append(nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + )) + for _ in range(num_feature_levels - num_backbone_outs): # downsample 2x + input_proj_list.append(nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + )) + in_channels = hidden_dim + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(backbone.num_channels[-3:][0], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + )]) + + self.num_frames = num_frames + self.mask_dim = mask_dim + self.backbone = backbone + self.aux_loss = aux_loss + self.with_box_refine = with_box_refine + assert two_stage == False, "args.two_stage must be false!"
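# Illustrative shape bookkeeping for the projection layers built above, assuming a
# ResNet-50 style backbone (res3/res4/res5 with 512/1024/2048 channels) and
# num_feature_levels = 4; actual channel counts depend on the chosen backbone:
#   input_proj[0]: 1x1 Conv  512 -> hidden_dim, followed by GroupNorm(32)
#   input_proj[1]: 1x1 Conv 1024 -> hidden_dim, followed by GroupNorm(32)
#   input_proj[2]: 1x1 Conv 2048 -> hidden_dim, followed by GroupNorm(32)
#   input_proj[3]: 3x3 stride-2 Conv 2048 -> hidden_dim, followed by GroupNorm(32),
#                  applied to the raw res5 map in forward() to add one coarser level.
# Every level therefore enters the deformable encoder with hidden_dim channels.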
+ + # initialization + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(num_classes) * bias_value + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + + num_pred = transformer.decoder.num_layers + if with_box_refine: + self.class_embed = _get_clones(self.class_embed, num_pred) + self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # hack implementation for iterative bounding box refinement + self.transformer.decoder.bbox_embed = self.bbox_embed + else: + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.transformer.decoder.bbox_embed = None + + # Build Text Encoder + # self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + # self.text_encoder = BertModel.from_pretrained('bert-base-cased') + self.tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base') + self.text_encoder = RobertaModel.from_pretrained('roberta-base') + + if freeze_text_encoder: + for p in self.text_encoder.parameters(): + p.requires_grad_(False) + + # resize the bert output channel to transformer d_model + self.resizer = FeatureResizer( + input_feat_size=768, + output_feat_size=hidden_dim, + dropout=0.1, + ) + + self.fusion_module = VisionLanguageFusionModule(d_model=hidden_dim, nhead=8) + self.text_pos = PositionEmbeddingSine1D(hidden_dim, normalize=True) + + # Build FPN Decoder + self.rel_coord = rel_coord + feature_channels = [self.backbone.num_channels[0]] + 3 * [hidden_dim] + self.pixel_decoder = CrossModalFPNDecoder(feature_channels=feature_channels, conv_dim=hidden_dim, + mask_dim=mask_dim, dim_feedforward=dim_feedforward, norm="GN") + + # Build Dynamic Conv + self.controller_layers = controller_layers + self.in_channels = mask_dim + self.dynamic_mask_channels = dynamic_mask_channels + self.mask_out_stride = 4 + self.mask_feat_stride = 4 + + weight_nums, bias_nums = [], [] + for l in range(self.controller_layers): + if l == 0: + if self.rel_coord: + weight_nums.append((self.in_channels + 2) * self.dynamic_mask_channels) + else: + weight_nums.append(self.in_channels * self.dynamic_mask_channels) + bias_nums.append(self.dynamic_mask_channels) + elif l == self.controller_layers - 1: + weight_nums.append(self.dynamic_mask_channels * 1) # output layer c -> 1 + bias_nums.append(1) + else: + weight_nums.append(self.dynamic_mask_channels * self.dynamic_mask_channels) + bias_nums.append(self.dynamic_mask_channels) + + self.weight_nums = weight_nums + self.bias_nums = bias_nums + self.num_gen_params = sum(weight_nums) + sum(bias_nums) + + self.controller = MLP(hidden_dim, hidden_dim, self.num_gen_params, 3) + for layer in self.controller.layers: + nn.init.zeros_(layer.bias) + nn.init.xavier_uniform_(layer.weight) + + + def forward(self, samples: NestedTensor, captions, targets): + """ The forward expects a NestedTensor, which consists of: + - samples.tensors: image sequences, of shape [num_frames x 3 x H x W] + - samples.mask: a binary mask of shape [num_frames x H x W], containing 1 on padded pixels + - captions: list[str] + - targets: list[dict] + + It returns a dict with the following elements: + 
- "pred_masks": Shape = [batch_size x num_queries x out_h x out_w] + + - "pred_logits": the classification logits (including no-object) for all queries. + Shape= [batch_size x num_queries x num_classes] + - "pred_boxes": The normalized boxes coordinates for all queries, represented as + (center_x, center_y, height, width). These values are normalized in [0, 1], + relative to the size of each individual image (disregarding possible padding). + See PostProcess for information on how to retrieve the unnormalized bounding box. + - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of + dictionnaries containing the two above keys for each decoder layer. + """ + # Backbone + if not isinstance(samples, NestedTensor): + samples = nested_tensor_from_videos_list(samples) + + # features (list[NestedTensor]): res2 -> res5, shape of tensors is [B*T, Ci, Hi, Wi] + # pos (list[Tensor]): shape of [B*T, C, Hi, Wi] + features, pos = self.backbone(samples) + + b = len(captions) + t = pos[0].shape[0] // b + + # For A2D-Sentences and JHMDB-Sentencs dataset, only one frame is annotated for a clip + if 'valid_indices' in targets[0]: + valid_indices = torch.tensor([i * t + target['valid_indices'] for i, target in enumerate(targets)]).to(pos[0].device) + for feature in features: + feature.tensors = feature.tensors.index_select(0, valid_indices) + feature.mask = feature.mask.index_select(0, valid_indices) + for i, p in enumerate(pos): + pos[i] = p.index_select(0, valid_indices) + samples.mask = samples.mask.index_select(0, valid_indices) + # t: num_frames -> 1 + t = 1 + + text_features, text_sentence_features = self.forward_text(captions, device=pos[0].device) + + # prepare vision and text features for transformer + srcs = [] + masks = [] + poses = [] + + text_pos = self.text_pos(text_features).permute(2, 0, 1) # [length, batch_size, c] + text_word_features, text_word_masks = text_features.decompose() + text_word_features = text_word_features.permute(1, 0, 2) # [length, batch_size, c] + + # Follow Deformable-DETR, we use the last three stages outputs from backbone + for l, (feat, pos_l) in enumerate(zip(features[-3:], pos[-3:])): + src, mask = feat.decompose() + src_proj_l = self.input_proj[l](src) + n, c, h, w = src_proj_l.shape + + # vision language early-fusion + src_proj_l = rearrange(src_proj_l, '(b t) c h w -> (t h w) b c', b=b, t=t) + src_proj_l = self.fusion_module(tgt=src_proj_l, + memory=text_word_features, + memory_key_padding_mask=text_word_masks, + pos=text_pos, + query_pos=None + ) + src_proj_l = rearrange(src_proj_l, '(t h w) b c -> (b t) c h w', t=t, h=h, w=w) + + srcs.append(src_proj_l) + masks.append(mask) + poses.append(pos_l) + assert mask is not None + + if self.num_feature_levels > (len(features) - 1): + _len_srcs = len(features) - 1 # fpn level + for l in range(_len_srcs, self.num_feature_levels): + if l == _len_srcs: + src = self.input_proj[l](features[-1].tensors) + else: + src = self.input_proj[l](srcs[-1]) + m = samples.mask + mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype) + n, c, h, w = src.shape + + # vision language early-fusion + src = rearrange(src, '(b t) c h w -> (t h w) b c', b=b, t=t) + src = self.fusion_module(tgt=src, + memory=text_word_features, + memory_key_padding_mask=text_word_masks, + pos=text_pos, + query_pos=None + ) + src = rearrange(src, '(t h w) b c -> (b t) c h w', t=t, h=h, w=w) + + srcs.append(src) + masks.append(mask) + 
poses.append(pos_l) + + # Transformer + query_embeds = self.query_embed.weight # [num_queries, c] + text_embed = repeat(text_sentence_features, 'b c -> b t q c', t=t, q=self.num_queries) + hs, memory, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, inter_samples = \ + self.transformer(srcs, text_embed, masks, poses, query_embeds) + # hs: [l, batch_size*time, num_queries_per_frame, c] + # memory: list[Tensor], shape of tensor is [batch_size*time, c, hi, wi] + # init_reference: [batch_size*time, num_queries_per_frame, 2] + # inter_references: [l, batch_size*time, num_queries_per_frame, 4] + + out = {} + # prediction + outputs_classes = [] + outputs_coords = [] + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[lvl](hs[lvl]) + tmp = self.bbox_embed[lvl](hs[lvl]) + if reference.shape[-1] == 4: + tmp += reference + else: + assert reference.shape[-1] == 2 + tmp[..., :2] += reference + outputs_coord = tmp.sigmoid() # cxcywh, range in [0,1] + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + # rearrange + outputs_class = rearrange(outputs_class, 'l (b t) q k -> l b t q k', b=b, t=t) + outputs_coord = rearrange(outputs_coord, 'l (b t) q n -> l b t q n', b=b, t=t) + out['pred_logits'] = outputs_class[-1] # [batch_size, time, num_queries_per_frame, num_classes] + out['pred_boxes'] = outputs_coord[-1] # [batch_size, time, num_queries_per_frame, 4] + + # Segmentation + mask_features = self.pixel_decoder(features, text_features, pos, memory, nf=t) # [batch_size*time, c, out_h, out_w] + mask_features = rearrange(mask_features, '(b t) c h w -> b t c h w', b=b, t=t) + + # dynamic conv + outputs_seg_masks = [] + for lvl in range(hs.shape[0]): + dynamic_mask_head_params = self.controller(hs[lvl]) # [batch_size*time, num_queries_per_frame, num_params] + dynamic_mask_head_params = rearrange(dynamic_mask_head_params, '(b t) q n -> b (t q) n', b=b, t=t) + lvl_references = inter_references[lvl, ..., :2] + lvl_references = rearrange(lvl_references, '(b t) q n -> b (t q) n', b=b, t=t) + outputs_seg_mask = self.dynamic_mask_with_coords(mask_features, dynamic_mask_head_params, lvl_references, targets) + outputs_seg_mask = rearrange(outputs_seg_mask, 'b (t q) h w -> b t q h w', t=t) + outputs_seg_masks.append(outputs_seg_mask) + out['pred_masks'] = outputs_seg_masks[-1] # [batch_size, time, num_queries_per_frame, out_h, out_w] + + if self.aux_loss: + out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord, outputs_seg_masks) + + if not self.training: + # for visualization + inter_references = inter_references[-2, :, :, :2] # [batch_size*time, num_queries_per_frame, 2] + inter_references = rearrange(inter_references, '(b t) q n -> b t q n', b=b, t=t) + out['reference_points'] = inter_references # the reference points of last layer input + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord, outputs_seg_masks): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
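+        # The last decoder layer is skipped ([:-1]) because its predictions are already returned
+        # as the top-level 'pred_logits' / 'pred_boxes' / 'pred_masks'.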
+ return [{"pred_logits": a, "pred_boxes": b, "pred_masks": c} + for a, b, c in zip(outputs_class[:-1], outputs_coord[:-1], outputs_seg_masks[:-1])] + + def forward_text(self, captions, device): + if isinstance(captions[0], str): + tokenized = self.tokenizer.batch_encode_plus(captions, padding="longest", return_tensors="pt").to(device) + encoded_text = self.text_encoder(**tokenized) + # encoded_text.last_hidden_state: [batch_size, length, 768] + # encoded_text.pooler_output: [batch_size, 768] + text_attention_mask = tokenized.attention_mask.ne(1).bool() + # text_attention_mask: [batch_size, length] + + text_features = encoded_text.last_hidden_state + text_features = self.resizer(text_features) + text_masks = text_attention_mask + text_features = NestedTensor(text_features, text_masks) # NestedTensor + + text_sentence_features = encoded_text.pooler_output + text_sentence_features = self.resizer(text_sentence_features) + else: + raise ValueError("Please mask sure the caption is a list of string") + return text_features, text_sentence_features + + def dynamic_mask_with_coords(self, mask_features, mask_head_params, reference_points, targets): + """ + Add the relative coordinates to the mask_features channel dimension, + and perform dynamic mask conv. + + Args: + mask_features: [batch_size, time, c, h, w] + mask_head_params: [batch_size, time * num_queries_per_frame, num_params] + reference_points: [batch_size, time * num_queries_per_frame, 2], cxcy + targets (list[dict]): length is batch size + we need the key 'size' for computing location. + Return: + outputs_seg_mask: [batch_size, time * num_queries_per_frame, h, w] + """ + device = mask_features.device + b, t, c, h, w = mask_features.shape + # this is the total query number in all frames + _, num_queries = reference_points.shape[:2] + q = num_queries // t # num_queries_per_frame + + # prepare reference points in image size (the size is input size to the model) + new_reference_points = [] + for i in range(b): + img_h, img_w = targets[i]['size'] + scale_f = torch.stack([img_w, img_h], dim=0) + tmp_reference_points = reference_points[i] * scale_f[None, :] + new_reference_points.append(tmp_reference_points) + new_reference_points = torch.stack(new_reference_points, dim=0) + # [batch_size, time * num_queries_per_frame, 2], in image size + reference_points = new_reference_points + + # prepare the mask features + if self.rel_coord: + reference_points = rearrange(reference_points, 'b (t q) n -> b t q n', t=t, q=q) + locations = compute_locations(h, w, device=device, stride=self.mask_feat_stride) + relative_coords = reference_points.reshape(b, t, q, 1, 1, 2) - \ + locations.reshape(1, 1, 1, h, w, 2) # [batch_size, time, num_queries_per_frame, h, w, 2] + relative_coords = relative_coords.permute(0, 1, 2, 5, 3, 4) # [batch_size, time, num_queries_per_frame, 2, h, w] + + # concat features + mask_features = repeat(mask_features, 'b t c h w -> b t q c h w', q=q) # [batch_size, time, num_queries_per_frame, c, h, w] + mask_features = torch.cat([mask_features, relative_coords], dim=3) + else: + mask_features = repeat(mask_features, 'b t c h w -> b t q c h w', q=q) # [batch_size, time, num_queries_per_frame, c, h, w] + mask_features = mask_features.reshape(1, -1, h, w) + + # parse dynamic params + mask_head_params = mask_head_params.flatten(0, 1) + weights, biases = parse_dynamic_params( + mask_head_params, self.dynamic_mask_channels, + self.weight_nums, self.bias_nums + ) + + # dynamic mask conv + mask_logits = self.mask_heads_forward(mask_features, 
weights, biases, mask_head_params.shape[0]) + mask_logits = mask_logits.reshape(-1, 1, h, w) + + # upsample predicted masks + assert self.mask_feat_stride >= self.mask_out_stride + assert self.mask_feat_stride % self.mask_out_stride == 0 + + mask_logits = aligned_bilinear(mask_logits, int(self.mask_feat_stride / self.mask_out_stride)) + mask_logits = mask_logits.reshape(b, num_queries, mask_logits.shape[-2], mask_logits.shape[-1]) + + return mask_logits # [batch_size, time * num_queries_per_frame, h, w] + + def mask_heads_forward(self, features, weights, biases, num_insts): + ''' + :param features + :param weights: [w0, w1, ...] + :param bias: [b0, b1, ...] + :return: + ''' + assert features.dim() == 4 + n_layers = len(weights) + x = features + for i, (w, b) in enumerate(zip(weights, biases)): + x = F.conv2d( + x, w, bias=b, + stride=1, padding=0, + groups=num_insts + ) + if i < n_layers - 1: + x = F.relu(x) + return x + + +def parse_dynamic_params(params, channels, weight_nums, bias_nums): + assert params.dim() == 2 + assert len(weight_nums) == len(bias_nums) + assert params.size(1) == sum(weight_nums) + sum(bias_nums) + + num_insts = params.size(0) + num_layers = len(weight_nums) + + params_splits = list(torch.split_with_sizes(params, weight_nums + bias_nums, dim=1)) + + weight_splits = params_splits[:num_layers] + bias_splits = params_splits[num_layers:] + + for l in range(num_layers): + if l < num_layers - 1: + # out_channels x in_channels x 1 x 1 + weight_splits[l] = weight_splits[l].reshape(num_insts * channels, -1, 1, 1) + bias_splits[l] = bias_splits[l].reshape(num_insts * channels) + else: + # out_channels x in_channels x 1 x 1 + weight_splits[l] = weight_splits[l].reshape(num_insts * 1, -1, 1, 1) + bias_splits[l] = bias_splits[l].reshape(num_insts) + + return weight_splits, bias_splits + +def aligned_bilinear(tensor, factor): + assert tensor.dim() == 4 + assert factor >= 1 + assert int(factor) == factor + + if factor == 1: + return tensor + + h, w = tensor.size()[2:] + tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate") + oh = factor * h + 1 + ow = factor * w + 1 + tensor = F.interpolate( + tensor, size=(oh, ow), + mode='bilinear', + align_corners=True + ) + tensor = F.pad( + tensor, pad=(factor // 2, 0, factor // 2, 0), + mode="replicate" + ) + + return tensor[:, :, :oh - 1, :ow - 1] + + +def compute_locations(h, w, device, stride=1): + shifts_x = torch.arange( + 0, w * stride, step=stride, + dtype=torch.float32, device=device) + + shifts_y = torch.arange( + 0, h * stride, step=stride, + dtype=torch.float32, device=device) + + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2 + return locations + + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + +class FeatureResizer(nn.Module): + """ + This class takes as input a set of embeddings of dimension C1 and outputs a set of + embedding of dimension C2, after a linear transformation, dropout and normalization (LN). 
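+    In this model it projects the 768-d RoBERTa word and sentence features to the transformer hidden_dim.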
+ """ + + def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): + super().__init__() + self.do_ln = do_ln + # Object feature encoding + self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True) + self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) + self.dropout = nn.Dropout(dropout) + + def forward(self, encoder_features): + x = self.fc(encoder_features) + if self.do_ln: + x = self.layer_norm(x) + output = self.dropout(x) + return output + + +def build(args): + if args.binary: + num_classes = 1 + else: + if args.dataset_file == 'ytvos': + num_classes = 65 + elif args.dataset_file == 'davis': + num_classes = 78 + elif args.dataset_file == 'a2d' or args.dataset_file == 'jhmdb': + num_classes = 1 + else: + num_classes = 91 # for coco + device = torch.device(args.device) + + # backbone + if 'video_swin' in args.backbone: + from .video_swin_transformer import build_video_swin_backbone + backbone = build_video_swin_backbone(args) + elif 'swin' in args.backbone: + from .swin_transformer import build_swin_backbone + backbone = build_swin_backbone(args) + else: + backbone = build_backbone(args) + + transformer = build_deforamble_transformer(args) + + model = ReferFormer( + backbone, + transformer, + num_classes=num_classes, + num_queries=args.num_queries, + num_feature_levels=args.num_feature_levels, + num_frames=args.num_frames, + mask_dim=args.mask_dim, + dim_feedforward=args.dim_feedforward, + controller_layers=args.controller_layers, + dynamic_mask_channels=args.dynamic_mask_channels, + aux_loss=args.aux_loss, + with_box_refine=args.with_box_refine, + two_stage=args.two_stage, + freeze_text_encoder=args.freeze_text_encoder, + rel_coord=args.rel_coord + ) + matcher = build_matcher(args) + weight_dict = {} + weight_dict['loss_ce'] = args.cls_loss_coef + weight_dict['loss_bbox'] = args.bbox_loss_coef + weight_dict['loss_giou'] = args.giou_loss_coef + if args.masks: # always true + weight_dict['loss_mask'] = args.mask_loss_coef + weight_dict['loss_dice'] = args.dice_loss_coef + # TODO this is a hack + if args.aux_loss: + aux_weight_dict = {} + for i in range(args.dec_layers - 1): + aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + + losses = ['labels', 'boxes'] + if args.masks: + losses += ['masks'] + criterion = SetCriterion( + num_classes, + matcher=matcher, + weight_dict=weight_dict, + eos_coef=args.eos_coef, + losses=losses, + focal_alpha=args.focal_alpha) + criterion.to(device) + + # postprocessors, this is used for coco pretrain but not for rvos + postprocessors = build_postprocessors(args, args.dataset_file) + return model, criterion, postprocessors + + + diff --git a/my_datasets/__init__.py b/my_datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..724c8d0616a65de7b215a3c5c5517f9af9c8f84e --- /dev/null +++ b/my_datasets/__init__.py @@ -0,0 +1,40 @@ +import torch.utils.data +import torchvision + +from .ytvos import build as build_ytvos +from .ytvos_ref import build as build_ytvos_ref +from .davis import build as build_davis +from .a2d import build as build_a2d +from .jhmdb import build as build_jhmdb +from .refexp import build as build_refexp +from .concat_dataset import build as build_joint + + +def get_coco_api_from_dataset(dataset): + for _ in range(10): + # if isinstance(dataset, torchvision.datasets.CocoDetection): + # break + if isinstance(dataset, torch.utils.data.Subset): + dataset = dataset.dataset + if isinstance(dataset, 
torchvision.datasets.CocoDetection): + return dataset.coco + + +def build_dataset(dataset_file: str, image_set: str, args): + if dataset_file == 'ytvos': + return build_ytvos(image_set, args) + if dataset_file == 'ytvos_ref': + return build_ytvos_ref(image_set, args) + if dataset_file == 'davis': + return build_davis(image_set, args) + if dataset_file == 'a2d': + return build_a2d(image_set, args) + if dataset_file == 'jhmdb': + return build_jhmdb(image_set, args) + # for pretraining + if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog": + return build_refexp(dataset_file, image_set, args) + # for joint training of refcoco and ytvos + if dataset_file == 'joint': + return build_joint(image_set, args) + raise ValueError(f'dataset {dataset_file} not supported') diff --git a/my_datasets/__pycache__/__init__.cpython-310.pyc b/my_datasets/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb9a6d221b0027d7de99ee3a7b11c335322f51b3 Binary files /dev/null and b/my_datasets/__pycache__/__init__.cpython-310.pyc differ diff --git a/my_datasets/__pycache__/a2d.cpython-310.pyc b/my_datasets/__pycache__/a2d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f92920d90dd633ae4b9cc2493f6c584034aef25 Binary files /dev/null and b/my_datasets/__pycache__/a2d.cpython-310.pyc differ diff --git a/my_datasets/__pycache__/a2d.cpython-39.pyc b/my_datasets/__pycache__/a2d.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15866b20f69694461f5ff478dadcd2778984aaa7 Binary files /dev/null and b/my_datasets/__pycache__/a2d.cpython-39.pyc differ diff --git a/my_datasets/__pycache__/jhmdb.cpython-39.pyc b/my_datasets/__pycache__/jhmdb.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7599ad5e2c66e8fd1ed994dec584f1ba0e56e616 Binary files /dev/null and b/my_datasets/__pycache__/jhmdb.cpython-39.pyc differ diff --git a/my_datasets/__pycache__/refexp2seq.cpython-310.pyc b/my_datasets/__pycache__/refexp2seq.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b69155a3c51ee5da7fb774f13fb1836c6491ffb2 Binary files /dev/null and b/my_datasets/__pycache__/refexp2seq.cpython-310.pyc differ diff --git a/my_datasets/__pycache__/transforms_image.cpython-310.pyc b/my_datasets/__pycache__/transforms_image.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b15347ade71f0e6f7412500a3e9286646097d628 Binary files /dev/null and b/my_datasets/__pycache__/transforms_image.cpython-310.pyc differ diff --git a/my_datasets/__pycache__/ytvos.cpython-310.pyc b/my_datasets/__pycache__/ytvos.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0973ad22eb6e6b9d156c4cb8098b6aac68929815 Binary files /dev/null and b/my_datasets/__pycache__/ytvos.cpython-310.pyc differ diff --git a/my_datasets/a2d_eval.py b/my_datasets/a2d_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..fba7e86b9da2fbc2e2600b740d3a49f448b51ef2 --- /dev/null +++ b/my_datasets/a2d_eval.py @@ -0,0 +1,96 @@ +""" +This file contains implementations for the precision@k and IoU (mean, overall) evaluation metrics. 
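+Precision@K is the fraction of samples whose predicted IoU with the ground truth exceeds the
+threshold K (K in {0.5, 0.6, 0.7, 0.8, 0.9}); overall IoU divides the summed intersection by the
+summed union over all samples, and mean IoU averages the per-sample IoUs.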
+copy-paste from https://github.com/mttr2021/MTTR/blob/main/metrics.py +""" +import torch +from tqdm import tqdm +from pycocotools.coco import COCO +from pycocotools.mask import decode +import numpy as np + +from torchvision.ops.boxes import box_area + +def compute_bbox_iou(boxes1: torch.Tensor, boxes2: torch.Tensor): + # both boxes: xyxy + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + wh = (rb - lt).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = (inter+1e-6) / (union+1e-6) + return iou, inter, union + +def compute_mask_iou(outputs: torch.Tensor, labels: torch.Tensor, EPS=1e-6): + outputs = outputs.int() + intersection = (outputs & labels).float().sum((1, 2)) # Will be zero if Truth=0 or Prediction=0 + union = (outputs | labels).float().sum((1, 2)) # Will be zero if both are 0 + iou = (intersection + EPS) / (union + EPS) # EPS is used to avoid division by zero + return iou, intersection, union + +# mask +def calculate_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): + print('evaluating mask precision@k & iou metrics...') + counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} + total_intersection_area = 0 + total_union_area = 0 + ious_list = [] + for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance + gt_annot = coco_gt.imgToAnns[instance][0] + gt_mask = decode(gt_annot['segmentation']) + pred_annots = coco_pred.imgToAnns[instance] + pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score + pred_mask = decode(pred_annot['segmentation']) + iou, intersection, union = compute_mask_iou(torch.tensor(pred_mask).unsqueeze(0), + torch.tensor(gt_mask).unsqueeze(0)) + iou, intersection, union = iou.item(), intersection.item(), union.item() + for iou_threshold in counters_by_iou.keys(): + if iou > iou_threshold: + counters_by_iou[iou_threshold] += 1 + total_intersection_area += intersection + total_union_area += union + ious_list.append(iou) + num_samples = len(ious_list) + precision_at_k = np.array(list(counters_by_iou.values())) / num_samples + overall_iou = total_intersection_area / total_union_area + mean_iou = np.mean(ious_list) + return precision_at_k, overall_iou, mean_iou + +# bbox +def calculate_bbox_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): + print('evaluating bbox precision@k & iou metrics...') + counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} + total_intersection_area = 0 + total_union_area = 0 + ious_list = [] + for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance + gt_annot = coco_gt.imgToAnns[instance][0] + gt_bbox = gt_annot['bbox'] # xywh + gt_bbox = [ + gt_bbox[0], + gt_bbox[1], + gt_bbox[2] + gt_bbox[0], + gt_bbox[3] + gt_bbox[1], + ] + pred_annots = coco_pred.imgToAnns[instance] + pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score + pred_bbox = pred_annot['bbox'] # xyxy + iou, intersection, union = compute_bbox_iou(torch.tensor(pred_bbox).unsqueeze(0), + torch.tensor(gt_bbox).unsqueeze(0)) + iou, intersection, union = iou.item(), intersection.item(), union.item() + for iou_threshold in counters_by_iou.keys(): + if iou > iou_threshold: + counters_by_iou[iou_threshold] += 1 + total_intersection_area += intersection + total_union_area += union + 
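+        # also keep the per-sample IoU so that mean IoU can be reported alongside overall IoU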
ious_list.append(iou) + num_samples = len(ious_list) + precision_at_k = np.array(list(counters_by_iou.values())) / num_samples + overall_iou = total_intersection_area / total_union_area + mean_iou = np.mean(ious_list) + return precision_at_k, overall_iou, mean_iou diff --git a/my_datasets/categories.py b/my_datasets/categories.py new file mode 100644 index 0000000000000000000000000000000000000000..f2cf7030bd86c40f3c7807f5712689acbfb7ded0 --- /dev/null +++ b/my_datasets/categories.py @@ -0,0 +1,54 @@ +# ------------------------------------------------------------------------------------------------------------------- +# 1. Ref-Youtube-VOS +ytvos_category_dict = { + 'airplane': 0, 'ape': 1, 'bear': 2, 'bike': 3, 'bird': 4, 'boat': 5, 'bucket': 6, 'bus': 7, 'camel': 8, 'cat': 9, + 'cow': 10, 'crocodile': 11, 'deer': 12, 'dog': 13, 'dolphin': 14, 'duck': 15, 'eagle': 16, 'earless_seal': 17, + 'elephant': 18, 'fish': 19, 'fox': 20, 'frisbee': 21, 'frog': 22, 'giant_panda': 23, 'giraffe': 24, 'hand': 25, + 'hat': 26, 'hedgehog': 27, 'horse': 28, 'knife': 29, 'leopard': 30, 'lion': 31, 'lizard': 32, 'monkey': 33, + 'motorbike': 34, 'mouse': 35, 'others': 36, 'owl': 37, 'paddle': 38, 'parachute': 39, 'parrot': 40, 'penguin': 41, + 'person': 42, 'plant': 43, 'rabbit': 44, 'raccoon': 45, 'sedan': 46, 'shark': 47, 'sheep': 48, 'sign': 49, + 'skateboard': 50, 'snail': 51, 'snake': 52, 'snowboard': 53, 'squirrel': 54, 'surfboard': 55, 'tennis_racket': 56, + 'tiger': 57, 'toilet': 58, 'train': 59, 'truck': 60, 'turtle': 61, 'umbrella': 62, 'whale': 63, 'zebra': 64 +} + +ytvos_category_list = [ + 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bucket', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frisbee', 'frog', + 'giant_panda', 'giraffe', 'hand', 'hat', 'hedgehog', 'horse', 'knife', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'others', 'owl', 'paddle', 'parachute', 'parrot', 'penguin', 'person', + 'plant', 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'sign', 'skateboard', 'snail', 'snake', 'snowboard', + 'squirrel', 'surfboard', 'tennis_racket', 'tiger', 'toilet', 'train', 'truck', 'turtle', 'umbrella', 'whale', 'zebra' +] + +ytvos_category_valid_list = [ + 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile', + 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog', + 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard', + 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person', + 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake', + 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra' +] + +# ------------------------------------------------------------------------------------------------------------------- +# 2. 
Ref-DAVIS17 +davis_category_dict = { + 'airplane': 0, 'backpack': 1, 'ball': 2, 'bear': 3, 'bicycle': 4, 'bird': 5, 'boat': 6, 'bottle': 7, 'box': 8, 'bus': 9, + 'camel': 10, 'car': 11, 'carriage': 12, 'cat': 13, 'cellphone': 14, 'chamaleon': 15, 'cow': 16, 'deer': 17, 'dog': 18, + 'dolphin': 19, 'drone': 20, 'elephant': 21, 'excavator': 22, 'fish': 23, 'goat': 24, 'golf cart': 25, 'golf club': 26, + 'grass': 27, 'guitar': 28, 'gun': 29, 'helicopter': 30, 'horse': 31, 'hoverboard': 32, 'kart': 33, 'key': 34, 'kite': 35, + 'koala': 36, 'leash': 37, 'lion': 38, 'lock': 39, 'mask': 40, 'microphone': 41, 'monkey': 42, 'motorcycle': 43, 'oar': 44, + 'paper': 45, 'paraglide': 46, 'person': 47, 'pig': 48, 'pole': 49, 'potted plant': 50, 'puck': 51, 'rack': 52, 'rhino': 53, + 'rope': 54, 'sail': 55, 'scale': 56, 'scooter': 57, 'selfie stick': 58, 'sheep': 59, 'skateboard': 60, 'ski': 61, 'ski poles': 62, + 'snake': 63, 'snowboard': 64, 'stick': 65, 'stroller': 66, 'surfboard': 67, 'swing': 68, 'tennis racket': 69, 'tractor': 70, + 'trailer': 71, 'train': 72, 'truck': 73, 'turtle': 74, 'varanus': 75, 'violin': 76, 'wheelchair': 77 +} + +davis_category_list = [ + 'airplane', 'backpack', 'ball', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'box', 'bus', 'camel', 'car', 'carriage', + 'cat', 'cellphone', 'chamaleon', 'cow', 'deer', 'dog', 'dolphin', 'drone', 'elephant', 'excavator', 'fish', 'goat', + 'golf cart', 'golf club', 'grass', 'guitar', 'gun', 'helicopter', 'horse', 'hoverboard', 'kart', 'key', 'kite', 'koala', + 'leash', 'lion', 'lock', 'mask', 'microphone', 'monkey', 'motorcycle', 'oar', 'paper', 'paraglide', 'person', 'pig', + 'pole', 'potted plant', 'puck', 'rack', 'rhino', 'rope', 'sail', 'scale', 'scooter', 'selfie stick', 'sheep', 'skateboard', + 'ski', 'ski poles', 'snake', 'snowboard', 'stick', 'stroller', 'surfboard', 'swing', 'tennis racket', 'tractor', 'trailer', + 'train', 'truck', 'turtle', 'varanus', 'violin', 'wheelchair' +] \ No newline at end of file diff --git a/my_datasets/coco.py b/my_datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d05d260c71ec942b5d3053a47823ed9fe03e3314 --- /dev/null +++ b/my_datasets/coco.py @@ -0,0 +1,157 @@ +""" +COCO dataset which returns image_id for evaluation. 
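+It wraps torchvision's CocoDetection and converts polygon annotations into boxes, labels and
+(optionally) binary instance masks.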
+ +Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py +""" +from pathlib import Path + +import torch +import torch.utils.data +import torchvision +from pycocotools import mask as coco_mask + +import datasets.transforms as T + + +class CocoDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, transforms, return_masks): + super(CocoDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + self.prepare = ConvertCocoPolysToMask(return_masks) + + def __getitem__(self, idx): + img, target = super(CocoDetection, self).__getitem__(idx) + image_id = self.ids[idx] + target = {'image_id': image_id, 'annotations': target} + + img, target = self.prepare(img, target) + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __init__(self, return_masks=False): + self.return_masks = return_masks + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + + anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + if self.return_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if self.return_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + if self.return_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = torch.as_tensor([int(h), int(w)]) + target["size"] = torch.as_tensor([int(h), int(w)]) + + return image, target + + +def make_coco_transforms(image_set): + + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] + + if image_set == 
'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.RandomSelect( + T.RandomResize(scales, max_size=1333), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=1333), + ]) + ), + normalize, + ]) + + if image_set == 'val': + return T.Compose([ + T.RandomResize([800], max_size=1333), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.coco_path) + assert root.exists(), f'provided COCO path {root} does not exist' + mode = 'instances' + PATHS = { + "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), + "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), + } + img_folder, ann_file = PATHS[image_set] + dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks) + return dataset diff --git a/my_datasets/refexp.py b/my_datasets/refexp.py new file mode 100644 index 0000000000000000000000000000000000000000..ee4cac86e0bedb8d1ef3e2e1aea3239715856b6d --- /dev/null +++ b/my_datasets/refexp.py @@ -0,0 +1,179 @@ +# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +COCO dataset which returns image_id for evaluation. +Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py +""" +from pathlib import Path + +import torch +import torch.utils.data +import torchvision +from pycocotools import mask as coco_mask + +import datasets.transforms_image as T + + +class ModulatedDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, transforms, return_masks): + super(ModulatedDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + self.prepare = ConvertCocoPolysToMask(return_masks) + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + img, target = super(ModulatedDetection, self).__getitem__(idx) + image_id = self.ids[idx] + coco_img = self.coco.loadImgs(image_id)[0] + caption = coco_img["caption"] + dataset_name = coco_img["dataset_name"] if "dataset_name" in coco_img else None + target = {"image_id": image_id, "annotations": target, "caption": caption} + img, target = self.prepare(img, target) + if self._transforms is not None: + img, target = self._transforms(img, target) + target["dataset_name"] = dataset_name + for extra_key in ["sentence_id", "original_img_id", "original_id", "task_id"]: + if extra_key in coco_img: + target[extra_key] = coco_img[extra_key] # box xyxy -> cxcywh + # FIXME: handle "valid", since some box may be removed due to random crop + target["valid"] = torch.tensor([1]) if len(target["area"]) != 0 else torch.tensor([0]) + + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + import random + idx = random.randint(0, self.__len__() - 1) + return img.unsqueeze(0), target + # return img: [1, 3, H, W], the first dimension means T = 1. 
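+        # Minimal usage sketch (hypothetical `args` with `coco_path` and `masks` set, as read by build() below):
+        #   dataset = build('refcoco', 'train', args)
+        #   img, target = dataset[0]  # img: [1, 3, H, W]; target includes 'caption', 'boxes', 'labels', 'masks', 'valid'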
+ + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __init__(self, return_masks=False): + self.return_masks = return_masks + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + caption = target["caption"] if "caption" in target else None + + anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] # xminyminwh -> xyxy + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + if self.return_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + # keep the valid boxes + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if self.return_masks: + masks = masks[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + if caption is not None: + target["caption"] = caption + if self.return_masks: + target["masks"] = masks + target["image_id"] = image_id + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + target["valid"] = torch.tensor([1]) + target["orig_size"] = torch.as_tensor([int(h), int(w)]) + target["size"] = torch.as_tensor([int(h), int(w)]) + return image, target + + +def make_coco_transforms(image_set, cautious): + + normalize = T.Compose([T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]) + + scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + final_scales = [296, 328, 360, 392, 416, 448, 480, 512] + + max_size = 800 + if image_set == "train": + horizontal = [] if cautious else [T.RandomHorizontalFlip()] + return T.Compose( + horizontal + + [ + T.RandomSelect( + T.RandomResize(scales, max_size=max_size), + T.Compose( + [ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600, respect_boxes=cautious), + T.RandomResize(final_scales, max_size=640), + ] + ), + ), + normalize, + ] + ) + + if image_set == "val": + return T.Compose( + [ + T.RandomResize([360], max_size=640), + normalize, + ] + ) + + raise ValueError(f"unknown {image_set}") + + +def build(dataset_file, image_set, args): + root = Path(args.coco_path) + assert root.exists(), f"provided COCO path {root} does not exist" + mode = "instances" + dataset = dataset_file + PATHS = { + "train": (root / "train2014", root / dataset / f"{mode}_{dataset}_train.json"), + "val": (root / "train2014", root / dataset / f"{mode}_{dataset}_val.json"), + } + + img_folder, ann_file = PATHS[image_set] + dataset = ModulatedDetection( + img_folder, + ann_file, + 
transforms=make_coco_transforms(image_set, False), + return_masks=args.masks, + ) + return dataset \ No newline at end of file diff --git a/my_datasets/refexp2seq.py b/my_datasets/refexp2seq.py new file mode 100644 index 0000000000000000000000000000000000000000..3bc50f1850f2992b8749f41b3c7b9de93250371d --- /dev/null +++ b/my_datasets/refexp2seq.py @@ -0,0 +1,229 @@ +# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# For building refcoco, refcoco+, refcocog datasets +""" +COCO dataset which returns image_id for evaluation. +Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py +""" +from pathlib import Path + +import torch +import torch.utils.data +import torchvision +from pycocotools import mask as coco_mask + +import random +import numpy as np +from PIL import Image + +import datasets.transforms_video as T +from datasets.image_to_seq_augmenter import ImageToSeqAugmenter + +from util.box_ops import masks_to_boxes + + +class ModulatedDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, num_frames, transforms, return_masks): + super(ModulatedDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + self.prepare = ConvertCocoPolysToMask(return_masks) + self.num_frames = num_frames + self.augmenter = ImageToSeqAugmenter(perspective=True, affine=True, motion_blur=True, + rotation_range=(-20, 20), perspective_magnitude=0.08, + hue_saturation_range=(-5, 5), brightness_range=(-40, 40), + motion_blur_prob=0.25, motion_blur_kernel_sizes=(9, 11), + translate_range=(-0.1, 0.1)) + + def apply_random_sequence_shuffle(self, images, instance_masks): + perm = list(range(self.num_frames)) + random.shuffle(perm) + images = [images[i] for i in perm] + instance_masks = [instance_masks[i] for i in perm] + return images, instance_masks + + def __getitem__(self, idx): + instance_check = False + while not instance_check: + img, target = super(ModulatedDetection, self).__getitem__(idx) + image_id = self.ids[idx] + coco_img = self.coco.loadImgs(image_id)[0] + caption = coco_img["caption"] + dataset_name = coco_img["dataset_name"] if "dataset_name" in coco_img else None + target = {"image_id": image_id, "annotations": target, "caption": caption} + img, target = self.prepare(img, target) + + # for a image, we rotate it to form a clip + seq_images, seq_instance_masks = [img], [target['masks'].numpy()] + numpy_masks = target['masks'].numpy() # [1, H, W] + + numinst = len(numpy_masks) + assert numinst == 1 + for t in range(self.num_frames - 1): + im_trafo, instance_masks_trafo = self.augmenter(np.asarray(img), numpy_masks) + im_trafo = Image.fromarray(np.uint8(im_trafo)) + seq_images.append(im_trafo) + seq_instance_masks.append(np.stack(instance_masks_trafo, axis=0)) + seq_images, seq_instance_masks = self.apply_random_sequence_shuffle(seq_images, seq_instance_masks) + output_inst_masks = [] + for inst_i in range(numinst): + inst_i_mask = [] + for f_i in range(self.num_frames): + inst_i_mask.append(seq_instance_masks[f_i][inst_i]) + output_inst_masks.append( np.stack(inst_i_mask, axis=0) ) + + output_inst_masks = torch.from_numpy( np.stack(output_inst_masks, axis=0) ) + target['masks'] = output_inst_masks.flatten(0,1) # [t, h, w] + target['boxes'] = masks_to_boxes(target['masks']) # [t, 4] + target['labels'] = target['labels'].repeat(self.num_frames) # [t,] + + if 
self._transforms is not None: + img, target = self._transforms(seq_images, target) + target["dataset_name"] = dataset_name + for extra_key in ["sentence_id", "original_img_id", "original_id", "task_id"]: + if extra_key in coco_img: + target[extra_key] = coco_img[extra_key] # box xyxy -> cxcywh + # FIXME: handle "valid", since some box may be removed due to random crop + if torch.any(target['valid'] == 1): # at leatst one instance + instance_check = True + else: + idx = random.randint(0, self.__len__() - 1) + + # set the gt box of empty mask to [0, 0, 0, 0] + for inst_id in range(len(target['boxes'])): + if target['masks'][inst_id].max()<1: + target['boxes'][inst_id] = torch.zeros(4).to(target['boxes'][inst_id]) + + target['boxes']=target['boxes'].clamp(1e-6) + return torch.stack(img,dim=0), target + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __init__(self, return_masks=False): + self.return_masks = return_masks + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + caption = target["caption"] if "caption" in target else None + + anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] # xminyminwh -> xyxy + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + if self.return_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + # keep the valid boxes + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if self.return_masks: + masks = masks[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + if caption is not None: + target["caption"] = caption + if self.return_masks: + target["masks"] = masks + target["image_id"] = image_id + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + target["valid"] = torch.tensor([1]) + target["orig_size"] = torch.as_tensor([int(h), int(w)]) + target["size"] = torch.as_tensor([int(h), int(w)]) + return image, target + + +def make_coco_transforms(image_set, max_size): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [288, 320, 352, 392, 416, 448, 480, 512] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.PhotometricDistort(), + T.RandomSelect( + T.Compose([ + T.RandomResize(scales, max_size=max_size), + T.Check(), + ]), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + 
T.RandomResize(scales, max_size=max_size), + T.Check(), + ]) + ), + normalize, + ]) + + if image_set == "val": + return T.Compose( + [ + T.RandomResize([360], max_size=640), + normalize, + ] + ) + + raise ValueError(f"unknown {image_set}") + + +def build(dataset_file, image_set, args): + root = Path(args.coco_path) + assert root.exists(), f"provided COCO path {root} does not exist" + mode = "instances" + dataset = dataset_file + PATHS = { + "train": (root / "train2014", root / dataset / f"{mode}_{dataset}_train.json"), + "val": (root / "train2014", root / dataset / f"{mode}_{dataset}_val.json"), + } + + img_folder, ann_file = PATHS[image_set] + dataset = ModulatedDetection( + img_folder, + ann_file, + num_frames=args.num_frames, + transforms=make_coco_transforms(image_set, args.max_size), + return_masks=args.masks, + ) + return dataset diff --git a/my_datasets/transforms_image.py b/my_datasets/transforms_image.py new file mode 100644 index 0000000000000000000000000000000000000000..eff840cdd0f3dc43b4679e654c42d16090cffb30 --- /dev/null +++ b/my_datasets/transforms_image.py @@ -0,0 +1,304 @@ +# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Transforms and data augmentation for both image + bbox. +""" +import random + +import PIL +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F + +from util.box_ops import box_xyxy_to_cxcywh +from util.misc import interpolate + + +def crop(image, target, region): + cropped_image = F.crop(image, *region) + + target = target.copy() + i, j, h, w = region + + # should we do something wrt the original size? + target["size"] = torch.tensor([h, w]) + + fields = ["labels", "area", "iscrowd", "positive_map", "isfinal"] + + if "boxes" in target: + boxes = target["boxes"] + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) + target["boxes"] = cropped_boxes.reshape(-1, 4) + target["area"] = area + fields.append("boxes") + + if "masks" in target: + # FIXME should we update the area here if there are no boxes? 
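+        # region is (top, left, height, width) = (i, j, h, w), so the mask crop mirrors F.crop above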
+ target["masks"] = target["masks"][:, i : i + h, j : j + w] + fields.append("masks") + + # remove elements for which the boxes or masks that have zero area + if "boxes" in target or "masks" in target: + # favor boxes selection when defining which elements to keep + # this is compatible with previous implementation + if "boxes" in target: + cropped_boxes = target["boxes"].reshape(-1, 2, 2) + keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) + else: + keep = target["masks"].flatten(1).any(1) + + for field in fields: + if field in target: + target[field] = target[field][keep] + + return cropped_image, target + + +def hflip(image, target): + flipped_image = F.hflip(image) + + w, h = image.size + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) + target["boxes"] = boxes + + if "masks" in target: + target["masks"] = target["masks"].flip(-1) + + if "caption" in target: + caption = target["caption"].replace("left", "[TMP]").replace("right", "left").replace("[TMP]", "right") + target["caption"] = caption + + return flipped_image, target + + +def resize(image, target, size, max_size=None): + # size can be min_size (scalar) or (w, h) tuple + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + size = get_size(image.size, size, max_size) + rescaled_image = F.resize(image, size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = torch.tensor([h, w]) + + if "masks" in target: + target["masks"] = interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + + return rescaled_image, target + + +def pad(image, target, padding): + # assumes that we only pad on the bottom right corners + padded_image = F.pad(image, (0, 0, padding[0], padding[1])) + if target is None: + return padded_image, None + target = target.copy() + # should we do something wrt the original size? 
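+    # FIXME: padded_image is a PIL Image, so the padded size is padded_image.size[::-1] (h, w)
+    # as in transforms_video.pad; indexing the image directly here looks like a bug.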
+ target["size"] = torch.tensor(padded_image[::-1]) + if "masks" in target: + target["masks"] = torch.nn.functional.pad(target["masks"], (0, padding[0], 0, padding[1])) + return padded_image, target + + +class RandomCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + region = T.RandomCrop.get_params(img, self.size) + return crop(img, target, region) + + +class RandomSizeCrop(object): + def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False): + self.min_size = min_size + self.max_size = max_size + self.respect_boxes = respect_boxes # if True we can't crop a box out + + def __call__(self, img: PIL.Image.Image, target: dict): + init_boxes = len(target["boxes"]) + max_patience = 100 + for i in range(max_patience): + w = random.randint(self.min_size, min(img.width, self.max_size)) + h = random.randint(self.min_size, min(img.height, self.max_size)) + region = T.RandomCrop.get_params(img, [h, w]) + result_img, result_target = crop(img, target, region) + if not self.respect_boxes or len(result_target["boxes"]) == init_boxes or i == max_patience - 1: + return result_img, result_target + return result_img, result_target + + +class CenterCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + image_width, image_height = img.size + crop_height, crop_width = self.size + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) + return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) + + +class RandomHorizontalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return hflip(img, target) + return img, target + + +class RandomResize(object): + def __init__(self, sizes, max_size=None): + assert isinstance(sizes, (list, tuple)) + self.sizes = sizes + self.max_size = max_size + + def __call__(self, img, target=None): + size = random.choice(self.sizes) + return resize(img, target, size, self.max_size) + + +class RandomPad(object): + def __init__(self, max_pad): + self.max_pad = max_pad + + def __call__(self, img, target): + pad_x = random.randint(0, self.max_pad) + pad_y = random.randint(0, self.max_pad) + return pad(img, target, (pad_x, pad_y)) + + +class RandomSelect(object): + """ + Randomly selects between transforms1 and transforms2, + with probability p for transforms1 and (1 - p) for transforms2 + """ + + def __init__(self, transforms1, transforms2, p=0.5): + self.transforms1 = transforms1 + self.transforms2 = transforms2 + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return self.transforms1(img, target) + return self.transforms2(img, target) + + +class ToTensor(object): + def __call__(self, img, target): + return F.to_tensor(img), target + + +class RandomErasing(object): + def __init__(self, *args, **kwargs): + self.eraser = T.RandomErasing(*args, **kwargs) + + def __call__(self, img, target): + return self.eraser(img), target + + +class Normalize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, image, target=None): + image = F.normalize(image, mean=self.mean, std=self.std) + if target is None: + return image, None + target = target.copy() + h, w = image.shape[-2:] + if "boxes" in target: + boxes = target["boxes"] + boxes = box_xyxy_to_cxcywh(boxes) + boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) + target["boxes"] = boxes + return image, 
target + + +class RemoveDifficult(object): + def __init__(self, enabled=False): + self.remove_difficult = enabled + + def __call__(self, image, target=None): + if target is None: + return image, None + target = target.copy() + keep = ~target["iscrowd"].to(torch.bool) | (not self.remove_difficult) + if "boxes" in target: + target["boxes"] = target["boxes"][keep] + target["labels"] = target["labels"][keep] + target["iscrowd"] = target["iscrowd"][keep] + return image, target + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string diff --git a/my_datasets/transforms_video.py b/my_datasets/transforms_video.py new file mode 100644 index 0000000000000000000000000000000000000000..f2145e9089185af92479328b878f158292d38d02 --- /dev/null +++ b/my_datasets/transforms_video.py @@ -0,0 +1,565 @@ +""" +Transforms and data augmentation for sequence level images, bboxes and masks. +""" +import random + +import PIL +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F + +from util.box_ops import box_xyxy_to_cxcywh, box_iou +from util.misc import interpolate +import numpy as np +from numpy import random as rand +from PIL import Image +import cv2 + + + +class Check(object): + def __init__(self,): + pass + def __call__(self, img, target): + fields = ["labels"] + if "boxes" in target: + fields.append("boxes") + if "masks" in target: + fields.append("masks") + + ### check if box or mask still exist after transforms + if "boxes" in target or "masks" in target: + if "boxes" in target: + cropped_boxes = target['boxes'].reshape(-1, 2, 2) + keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) + else: + keep = target['masks'].flatten(1).any(1) + + if False in keep: + for k in range(len(keep)): + if not keep[k] and "boxes" in target: + target['boxes'][k] = target['boxes'][k]//1000.0 # [0, 0, 0, 0] + + target['valid'] = keep.to(torch.int32) + + return img, target + + + +def bbox_overlaps(bboxes1, bboxes2, mode='iou', eps=1e-6): + assert mode in ['iou', 'iof'] + bboxes1 = bboxes1.astype(np.float32) + bboxes2 = bboxes2.astype(np.float32) + rows = bboxes1.shape[0] + cols = bboxes2.shape[0] + ious = np.zeros((rows, cols), dtype=np.float32) + if rows * cols == 0: + return ious + exchange = False + if bboxes1.shape[0] > bboxes2.shape[0]: + bboxes1, bboxes2 = bboxes2, bboxes1 + ious = np.zeros((cols, rows), dtype=np.float32) + exchange = True + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1]) + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1]) + for i in range(bboxes1.shape[0]): + x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) + y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) + x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) + y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) + overlap = np.maximum(x_end - x_start, 0) * np.maximum(y_end - y_start, 0) + if mode == 'iou': + union = area1[i] + area2 - overlap + else: + union = area1[i] if not exchange else area2 + union = np.maximum(union, eps) + ious[i, :] = overlap / union + if exchange: + ious = ious.T + return ious + + +def crop(clip, target, region): + cropped_image = [] + for image in clip: + 
cropped_image.append(F.crop(image, *region)) + + target = target.copy() + i, j, h, w = region + + # should we do something wrt the original size? + target["size"] = torch.tensor([h, w]) + + fields = ["labels", "area", "iscrowd"] + + if "boxes" in target: + boxes = target["boxes"] + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) + target["boxes"] = cropped_boxes.reshape(-1, 4) + target["area"] = area + fields.append("boxes") + + if "masks" in target: + # FIXME should we update the area here if there are no boxes? + target['masks'] = target['masks'][:, i:i + h, j:j + w] + fields.append("masks") + + return cropped_image, target + + +def hflip(clip, target): + flipped_image = [] + for image in clip: + flipped_image.append(F.hflip(image)) + + w, h = clip[0].size + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) + target["boxes"] = boxes + + if "masks" in target: + target['masks'] = target['masks'].flip(-1) + + return flipped_image, target + +def vflip(image,target): + flipped_image = [] + for image in clip: + flipped_image.append(F.vflip(image)) + w, h = clip[0].size + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [0, 3, 2, 1]] * torch.as_tensor([1, -1, 1, -1]) + torch.as_tensor([0, h, 0, h]) + target["boxes"] = boxes + + if "masks" in target: + target['masks'] = target['masks'].flip(1) + + return flipped_image, target + +def resize(clip, target, size, max_size=None): + # size can be min_size (scalar) or (w, h) tuple + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + size = get_size(clip[0].size, size, max_size) + rescaled_image = [] + for image in clip: + rescaled_image.append(F.resize(image, size)) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image[0].size, clip[0].size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = torch.tensor([h, w]) + + if "masks" in target: + if target['masks'].shape[0]>0: + target['masks'] = interpolate( + target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + else: + target['masks'] = torch.zeros((target['masks'].shape[0],h,w)) + return rescaled_image, target 
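+
+# pad and the crop/flip-style transforms below follow the same clip-level convention as
+# crop/hflip/resize above: a list of PIL frames plus one shared target dict, whose
+# 'boxes'/'masks'/'size' entries are updated to stay consistent with the transformed frames.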
+ + +def pad(clip, target, padding): + # assumes that we only pad on the bottom right corners + padded_image = [] + for image in clip: + padded_image.append(F.pad(image, (0, 0, padding[0], padding[1]))) + if target is None: + return padded_image, None + target = target.copy() + # should we do something wrt the original size? + target["size"] = torch.tensor(padded_image[0].size[::-1]) + if "masks" in target: + target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1])) + return padded_image, target + + +class RandomCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + region = T.RandomCrop.get_params(img, self.size) + return crop(img, target, region) + + +class RandomSizeCrop(object): + def __init__(self, min_size: int, max_size: int): + self.min_size = min_size + self.max_size = max_size + + def __call__(self, img: PIL.Image.Image, target: dict): + w = random.randint(self.min_size, min(img[0].width, self.max_size)) + h = random.randint(self.min_size, min(img[0].height, self.max_size)) + region = T.RandomCrop.get_params(img[0], [h, w]) + return crop(img, target, region) + + +class CenterCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + image_width, image_height = img.size + crop_height, crop_width = self.size + crop_top = int(round((image_height - crop_height) / 2.)) + crop_left = int(round((image_width - crop_width) / 2.)) + return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) + + +class MinIoURandomCrop(object): + def __init__(self, min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3): + self.min_ious = min_ious + self.sample_mode = (1, *min_ious, 0) + self.min_crop_size = min_crop_size + + def __call__(self, img, target): + w,h = img.size + while True: + mode = random.choice(self.sample_mode) + self.mode = mode + if mode == 1: + return img,target + min_iou = mode + boxes = target['boxes'].numpy() + labels = target['labels'] + + for i in range(50): + new_w = rand.uniform(self.min_crop_size * w, w) + new_h = rand.uniform(self.min_crop_size * h, h) + if new_h / new_w < 0.5 or new_h / new_w > 2: + continue + left = rand.uniform(w - new_w) + top = rand.uniform(h - new_h) + patch = np.array((int(left), int(top), int(left + new_w), int(top + new_h))) + if patch[2] == patch[0] or patch[3] == patch[1]: + continue + overlaps = bbox_overlaps(patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1) + if len(overlaps) > 0 and overlaps.min() < min_iou: + continue + + if len(overlaps) > 0: + def is_center_of_bboxes_in_patch(boxes, patch): + center = (boxes[:, :2] + boxes[:, 2:]) / 2 + mask = ((center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * (center[:, 0] < patch[2]) * (center[:, 1] < patch[3])) + return mask + mask = is_center_of_bboxes_in_patch(boxes, patch) + if False in mask: + continue + #TODO: use no center boxes + #if not mask.any(): + # continue + + boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) + boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) + boxes -= np.tile(patch[:2], 2) + target['boxes'] = torch.tensor(boxes) + + img = np.asarray(img)[patch[1]:patch[3], patch[0]:patch[2]] + img = Image.fromarray(img) + width, height = img.size + target['orig_size'] = torch.tensor([height,width]) + target['size'] = torch.tensor([height,width]) + return img,target + + +class RandomContrast(object): + def __init__(self, lower=0.5, upper=1.5): + self.lower = lower + self.upper = upper + assert self.upper >= self.lower, "contrast upper must be >= lower." 
+ assert self.lower >= 0, "contrast lower must be non-negative." + def __call__(self, image, target): + + if rand.randint(2): + alpha = rand.uniform(self.lower, self.upper) + image *= alpha + return image, target + +class RandomBrightness(object): + def __init__(self, delta=32): + assert delta >= 0.0 + assert delta <= 255.0 + self.delta = delta + def __call__(self, image, target): + if rand.randint(2): + delta = rand.uniform(-self.delta, self.delta) + image += delta + return image, target + +class RandomSaturation(object): + def __init__(self, lower=0.5, upper=1.5): + self.lower = lower + self.upper = upper + assert self.upper >= self.lower, "contrast upper must be >= lower." + assert self.lower >= 0, "contrast lower must be non-negative." + + def __call__(self, image, target): + if rand.randint(2): + image[:, :, 1] *= rand.uniform(self.lower, self.upper) + return image, target + +class RandomHue(object): # + def __init__(self, delta=18.0): + assert delta >= 0.0 and delta <= 360.0 + self.delta = delta + + def __call__(self, image, target): + if rand.randint(2): + image[:, :, 0] += rand.uniform(-self.delta, self.delta) + image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 + image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 + return image, target + +class RandomLightingNoise(object): + def __init__(self): + self.perms = ((0, 1, 2), (0, 2, 1), + (1, 0, 2), (1, 2, 0), + (2, 0, 1), (2, 1, 0)) + def __call__(self, image, target): + if rand.randint(2): + swap = self.perms[rand.randint(len(self.perms))] + shuffle = SwapChannels(swap) # shuffle channels + image = shuffle(image) + return image, target + +class ConvertColor(object): + def __init__(self, current='BGR', transform='HSV'): + self.transform = transform + self.current = current + + def __call__(self, image, target): + if self.current == 'BGR' and self.transform == 'HSV': + image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) + elif self.current == 'HSV' and self.transform == 'BGR': + image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) + else: + raise NotImplementedError + return image, target + +class SwapChannels(object): + def __init__(self, swaps): + self.swaps = swaps + def __call__(self, image): + image = image[:, :, self.swaps] + return image + +class PhotometricDistort(object): + def __init__(self): + self.pd = [ + RandomContrast(), + ConvertColor(transform='HSV'), + RandomSaturation(), + RandomHue(), + ConvertColor(current='HSV', transform='BGR'), + RandomContrast() + ] + self.rand_brightness = RandomBrightness() + self.rand_light_noise = RandomLightingNoise() + + def __call__(self,clip,target): + imgs = [] + for img in clip: + img = np.asarray(img).astype('float32') + img, target = self.rand_brightness(img, target) + if rand.randint(2): + distort = Compose(self.pd[:-1]) + else: + distort = Compose(self.pd[1:]) + img, target = distort(img, target) + img, target = self.rand_light_noise(img, target) + imgs.append(Image.fromarray(img.astype('uint8'))) + return imgs, target + +# NOTICE: if used for mask, need to change +class Expand(object): + def __init__(self, mean): + self.mean = mean + def __call__(self, clip, target): + if rand.randint(2): + return clip,target + imgs = [] + masks = [] + image = np.asarray(clip[0]).astype('float32') + height, width, depth = image.shape + ratio = rand.uniform(1, 4) + left = rand.uniform(0, width*ratio - width) + top = rand.uniform(0, height*ratio - height) + for i in range(len(clip)): + image = np.asarray(clip[i]).astype('float32') + expand_image = np.zeros((int(height*ratio), int(width*ratio), 
depth),dtype=image.dtype) + expand_image[:, :, :] = self.mean + expand_image[int(top):int(top + height),int(left):int(left + width)] = image + imgs.append(Image.fromarray(expand_image.astype('uint8'))) + expand_mask = torch.zeros((int(height*ratio), int(width*ratio)),dtype=torch.uint8) + expand_mask[int(top):int(top + height),int(left):int(left + width)] = target['masks'][i] + masks.append(expand_mask) + boxes = target['boxes'].numpy() + boxes[:, :2] += (int(left), int(top)) + boxes[:, 2:] += (int(left), int(top)) + target['boxes'] = torch.tensor(boxes) + target['masks']=torch.stack(masks) + return imgs, target + +class RandomHorizontalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + # NOTE: caption for 'left' and 'right' should also change + caption = target['caption'] + target['caption'] = caption.replace('left', '@').replace('right', 'left').replace('@', 'right') + return hflip(img, target) + return img, target + +class RandomVerticalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return vflip(img, target) + return img, target + + +class RandomResize(object): + def __init__(self, sizes, max_size=None): + assert isinstance(sizes, (list, tuple)) + self.sizes = sizes + self.max_size = max_size + + def __call__(self, img, target=None): + size = random.choice(self.sizes) + return resize(img, target, size, self.max_size) + + +class RandomPad(object): + def __init__(self, max_pad): + self.max_pad = max_pad + + def __call__(self, img, target): + pad_x = random.randint(0, self.max_pad) + pad_y = random.randint(0, self.max_pad) + return pad(img, target, (pad_x, pad_y)) + + +class RandomSelect(object): + """ + Randomly selects between transforms1 and transforms2, + with probability p for transforms1 and (1 - p) for transforms2 + """ + def __init__(self, transforms1, transforms2, p=0.5): + self.transforms1 = transforms1 + self.transforms2 = transforms2 + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return self.transforms1(img, target) + return self.transforms2(img, target) + + +class ToTensor(object): + def __call__(self, clip, target): + img = [] + for im in clip: + img.append(F.to_tensor(im)) + return img, target + + +class RandomErasing(object): + + def __init__(self, *args, **kwargs): + self.eraser = T.RandomErasing(*args, **kwargs) + + def __call__(self, img, target): + return self.eraser(img), target + + +class Normalize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, clip, target=None): + image = [] + for im in clip: + image.append(F.normalize(im, mean=self.mean, std=self.std)) + if target is None: + return image, None + target = target.copy() + h, w = image[0].shape[-2:] + if "boxes" in target: + boxes = target["boxes"] + boxes = box_xyxy_to_cxcywh(boxes) + boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) + target["boxes"] = boxes + return image, target + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string
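PhotometricDistort and the jitter classes defined earlier (RandomBrightness, RandomContrast, RandomSaturation, RandomHue) operate on frames as float32 NumPy arrays, flip a coin per operation with rand.randint(2), and only convert back to uint8 PIL images at the end. A small standalone sketch of that convention; the shift and scale ranges are illustrative assumptions, not values taken from this module:

import numpy as np
from numpy import random as rand

img = np.full((2, 2, 3), 128.0, dtype=np.float32)  # toy float32 image in [0, 255]

if rand.randint(2):                    # 50/50 coin flip, as in the classes above
    img += rand.uniform(-32.0, 32.0)   # additive brightness shift
if rand.randint(2):
    img *= rand.uniform(0.5, 1.5)      # multiplicative contrast scale

# Jittered values can leave [0, 255]; this sketch clips before casting back to uint8.
img = np.clip(img, 0, 255).astype(np.uint8)
print(img[0, 0])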
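Normalize above standardizes pixel statistics and also converts absolute xyxy boxes to normalized cxcywh through box_xyxy_to_cxcywh, which this file presumably imports from the repo's util.box_ops. A self-contained sketch of that box conversion (the function name here is illustrative):

import torch

def box_xyxy_to_cxcywh_norm(boxes: torch.Tensor, h: int, w: int) -> torch.Tensor:
    # Absolute xyxy -> center-x, center-y, width, height, divided by image size.
    x1, y1, x2, y2 = boxes.unbind(-1)
    out = torch.stack([(x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1], dim=-1)
    return out / torch.tensor([w, h, w, h], dtype=torch.float32)

print(box_xyxy_to_cxcywh_norm(torch.tensor([[10., 20., 50., 80.]]), h=100, w=200))
# tensor([[0.1500, 0.5000, 0.2000, 0.6000]])

For a 200x100 (w x h) frame, the box [10, 20, 50, 80] thus becomes [0.15, 0.50, 0.20, 0.60], which is the format the downstream box losses expect.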
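Putting the pieces together, a typical train-time chain over a clip (a list of PIL frames) and its target dict might look like the sketch below. The resize scales, normalization statistics, and the import path datasets.transforms_video are assumptions about how this module is meant to be wired up, not the repo's actual training configuration:

import datasets.transforms_video as T  # assumed import path for this file

train_transforms = T.Compose([
    T.RandomHorizontalFlip(0.5),          # also swaps 'left'/'right' in target['caption']
    T.PhotometricDistort(),
    T.RandomResize([288, 320, 352, 384], max_size=640),  # placeholder scales
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # ImageNet statistics
])

# clip: list of PIL.Image frames; target: dict with 'boxes', 'masks', 'caption', ...
clip, target = train_transforms(clip, target)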