dianecy committed on Feb 13, 2025

Commit

3ec4928

verified ·

1 Parent(s): 3b5fc39

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
.history/datasets/a2d_20250203174308.py +247 -0
.history/datasets/ytvos_ref_20250113131134.py +241 -0
.history/datasets/ytvos_ref_20250113131327.py +241 -0
.history/datasets/ytvos_ref_20250113141118.py +241 -0
.history/datasets/ytvos_ref_20250113162417.py +241 -0
.history/datasets/ytvos_ref_20250113163313.py +248 -0
.history/datasets/ytvos_ref_20250114201904.py +252 -0
.history/datasets/ytvos_ref_20250114201908.py +253 -0
.history/datasets/ytvos_ref_20250114202340.py +251 -0
.history/datasets/ytvos_ref_20250114205314.py +250 -0
.history/datasets/ytvos_ref_20250114211305.py +252 -0
.history/datasets/ytvos_ref_20250116074326.py +239 -0
.history/mbench/gpt_ref-ytvos-cy_20250121151513.py +433 -0
.history/mbench/gpt_ref-ytvos-revised_20250121160858.py +428 -0
.history/mbench/gpt_ref-ytvos_20250119070820.py +286 -0
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183936.py +199 -0
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190533.py +429 -0
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190813.py +427 -0
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130220417.py +427 -0
.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140559.py +461 -0
.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141240.py +460 -0
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172754.py +656 -0
.history/mbench/make_ref-ytvos_json_20250113182322.py +100 -0
.history/mbench/make_ref-ytvos_json_20250113182734.py +102 -0
.history/mbench/make_ref-ytvos_json_20250113182817.py +103 -0
.history/mbench/make_ref-ytvos_json_20250113182842.py +102 -0
.history/mbench/make_ref-ytvos_json_20250113183130.py +102 -0
.history/mbench/make_ref-ytvos_json_20250116141513.py +103 -0
.history/mbench/make_ref-ytvos_json_20250118024325.py +108 -0
.history/mbench/ytvos_ref_20250121152309.py +264 -0
.history/mbench_a2d/gpt_a2d_numbered_20250205111640.py +82 -0
.history/mbench_a2d/gpt_a2d_numbered_20250205122340.py +196 -0
.history/mbench_a2d/gpt_a2d_numbered_20250205152326.py +200 -0
.history/mbench_a2d/gpt_a2d_numbered_20250207110257.py +213 -0
.history/slurm_script/jupyter_20250121151552.sh +16 -0
.history/slurm_script/jupyter_20250121151643.sh +16 -0
.history/slurm_script/mbench_gpt_a2d_20250205122515.sh +19 -0
.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155940.sh +18 -0
.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121160841.sh +18 -0
.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250124085144.sh +18 -0
.history/slurm_script/mbench_gpt_ref-ytvos_20250119070944.sh +18 -0
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130190228.sh +20 -0
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250201140706.sh +20 -0
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250202183206.sh +20 -0
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171604.sh +20 -0
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207172920.sh +20 -0
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/ca26d90c9e8e071d0bc31b570aef68306d0be1db4330471d10a117061a15a991.lock +0 -0
hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/pytorch_model.bin +0 -0
hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b +3 -0

.gitattributes CHANGED Viewed

@@ -46,3 +46,4 @@ LAVT-RIS/refer/data/refcocog/instances.json filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/data/refcocog/refs(google).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/data/refcocog/refs(umd).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/evaluation/tokenizer/stanford-corenlp-3.4.1.jar filter=lfs diff=lfs merge=lfs -text

 LAVT-RIS/refer/data/refcocog/refs(google).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/data/refcocog/refs(umd).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/evaluation/tokenizer/stanford-corenlp-3.4.1.jar filter=lfs diff=lfs merge=lfs -text
+hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b filter=lfs diff=lfs merge=lfs -text

.history/datasets/a2d_20250203174308.py ADDED Viewed

	@@ -0,0 +1,247 @@

+"""
+A2D-Sentences data loader
+modified from https://github.com/mttr2021/MTTR/blob/main/datasets/a2d_sentences/a2d_sentences_dataset.py
+"""
+from pathlib import Path
+import torch
+from torchvision.io import read_video
+import torchvision.transforms.functional as F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+import h5py
+from pycocotools.mask import encode, area
+def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
+    """Return the sample identifier ``v_<video_id>_f_<frame_idx>_i_<ref_instance_a2d_id>``."""
+    return f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
+class A2DSentencesDataset(Dataset):
+    """
+    A Torch dataset for A2D-Sentences.
+    For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at:
+    https://arxiv.org/abs/1803.07485
+    Every item corresponds to one ``(text_query, video_id, frame_idx, instance_id)`` entry of ``ann_file``.
+    ``__getitem__`` returns ``num_frames`` frames sampled around the annotated frame plus the target (box, mask,
+    caption, ...) of that single annotated frame.
+    """
+    def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int, subset):
+        """
+        Args:
+            image_folder: dataset root; masks are read from ``text_annotations/a2d_annotation_with_instances`` and
+                videos from ``Release/clips320H`` below it.
+            ann_file: JSON file containing a list of ``[text_query, video_id, frame_idx, instance_id]`` entries.
+            transforms: optional callable invoked as ``transforms(imgs, target)``; with ``None`` raw frames are returned.
+            return_masks: stored only, not used.
+            num_frames: number of frames returned per sample.
+            max_skip: stored only, not read by this class.
+            subset: ``'train'`` (random sparse sampling) or ``'val'`` (contiguous window around the annotated frame).
+        """
+        super().__init__()
+        dataset_path = str(image_folder)
+        self.mask_annotations_dir = os.path.join(dataset_path, 'text_annotations/a2d_annotation_with_instances')
+        self.videos_dir = os.path.join(dataset_path, 'Release/clips320H')
+        self.ann_file = ann_file
+        self.text_annotations = self.get_text_annotations()
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        self.subset = subset
+        print(f'\n {subset} sample num: ', len(self.text_annotations))
+        print('\n')
+    def get_text_annotations(self):
+        """Load ``self.ann_file`` and return its entries as a list of tuples."""
+        with open(str(self.ann_file), 'r') as f:
+            return [tuple(a) for a in json.load(f)]
+    @staticmethod
+    def bounding_box(img):
+        """
+        Return ``(rmin, rmax, cmin, cmax)``: the first/last row and column index holding a non-zero value of the 2D
+        array ``img``. ``img`` must contain at least one non-zero element, otherwise the indexing below fails.
+        """
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        """Number of text annotations (one sample per annotation)."""
+        return len(self.text_annotations)
+    def _sample_train_indices(self, frame_id, vid_len):
+        """Return sorted frame indices for training: the annotated frame, two nearby frames and random global frames."""
+        sample_indx = [frame_id]
+        # local sample: one frame 1-3 steps before and one 1-3 steps after, clamped to the video
+        sample_id_before = random.randint(1, 3)
+        sample_id_after = random.randint(1, 3)
+        sample_indx.extend([max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)])
+        # global sampling fills the remaining slots
+        if self.num_frames > 3:
+            all_inds = list(range(vid_len))
+            global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
+            global_n = self.num_frames - len(sample_indx)
+            if len(global_inds) > global_n:
+                picks = random.sample(range(len(global_inds)), global_n)
+                sample_indx.extend(global_inds[p] for p in picks)
+            elif vid_len >= global_n:  # sample long range global frames
+                sample_indx.extend(random.sample(range(vid_len), global_n))
+            else:
+                # video shorter than the number of missing frames: take every frame and pad with repeats
+                extra = global_n - vid_len
+                if extra <= vid_len:
+                    picks = random.sample(range(vid_len), extra)
+                else:
+                    # random.sample cannot draw more items than the population holds
+                    picks = random.choices(range(vid_len), k=extra)
+                sample_indx.extend(picks + all_inds)
+        sample_indx.sort()
+        return sample_indx
+    def _sample_val_indices(self, frame_id, vid_len):
+        """Return a contiguous window of ``num_frames`` indices around ``frame_id``, padded with edge frames."""
+        start_idx = frame_id - self.num_frames // 2
+        end_idx = frame_id + (self.num_frames + 1) // 2
+        return sorted(min(max(i, 0), vid_len - 1) for i in range(start_idx, end_idx))
+    def _load_instance_masks(self, video_id, frame_idx, instance_id):
+        """
+        Read the annotation file of ``frame_idx`` and return ``(instance_masks, instance_idx)``: all instance masks
+        of the frame as a tensor with its last two axes swapped, and the position of ``instance_id`` among them.
+        ``list.index`` raises ``ValueError`` if the instance is not listed; nothing in this class validates that earlier.
+        """
+        frame_annot_path = os.path.join(self.mask_annotations_dir, video_id, f'{frame_idx:05d}.h5')
+        # explicit read-only mode and a context manager so the file is closed even if reading fails
+        with h5py.File(frame_annot_path, 'r') as f:
+            instances = list(f['instance'])
+            instance_masks = np.array(f['reMask'])
+        instance_idx = instances.index(instance_id)
+        if len(instances) == 1:
+            # a single instance is stored without the leading instance axis
+            instance_masks = instance_masks[np.newaxis, ...]
+        instance_masks = torch.tensor(instance_masks).transpose(1, 2)
+        return instance_masks, instance_idx
+    def __getitem__(self, idx):
+        """
+        Return ``(imgs, target)`` for annotation ``idx``.
+        When the resulting target has no valid entry (empty mask, or the box was dropped by the transforms), another
+        annotation index is drawn at random and loaded instead.
+        Raises:
+            ValueError: if ``self.subset`` is neither ``'train'`` nor ``'val'``.
+        """
+        if self.subset not in ('train', 'val'):
+            # used to surface later as an UnboundLocalError on the sampled indices
+            raise ValueError(f'unknown subset {self.subset}')
+        while True:
+            text_query, video_id, frame_idx, instance_id = self.text_annotations[idx]
+            text_query = " ".join(text_query.lower().split())  # clean up the text query
+            # read the source window frames:
+            video_frames, _, _ = read_video(os.path.join(self.videos_dir, f'{video_id}.mp4'), pts_unit='sec')  # (T, H, W, C)
+            vid_len = len(video_frames)
+            # note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
+            frame_id = frame_idx - 1
+            if self.subset == 'train':
+                sample_indx = self._sample_train_indices(frame_id, vid_len)
+            else:
+                sample_indx = self._sample_val_indices(frame_id, vid_len)
+            # position of the annotated frame in the sampled list, there is only one valid frame
+            valid_indices = sample_indx.index(frame_id)
+            # read frames
+            imgs = [F.to_pil_image(video_frames[i].permute(2, 0, 1)) for i in sample_indx[:self.num_frames]]
+            # read the instance mask and select the referred one
+            instance_masks, instance_idx = self._load_instance_masks(video_id, frame_idx, instance_id)
+            label = torch.tensor(0, dtype=torch.long)
+            mask = instance_masks[instance_idx].numpy()
+            if (mask > 0).any():
+                y1, y2, x1, x2 = self.bounding_box(mask)
+                box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                valid = [1]
+            else: # some frame didn't contain the instance
+                box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                valid = [0]
+            mask = torch.from_numpy(mask)
+            # transform
+            h, w = instance_masks.shape[-2:]
+            labels = torch.stack([label], dim=0)
+            boxes = torch.stack([box], dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack([mask], dim=0)
+            # there is only one valid frame
+            target = {
+                'frames_idx': torch.tensor(sample_indx), # [T,]
+                'valid_indices': torch.tensor([valid_indices]),
+                'labels': labels,                        # [1,]
+                'boxes': boxes,                          # [1, 4], xyxy
+                'masks': masks,                          # [1, H, W]
+                'valid': torch.tensor(valid),            # [1,]
+                'caption': text_query,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)]),
+                'image_id': get_image_id(video_id,frame_idx, instance_id)
+            }
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at least one instance
+                return imgs, target
+            # nothing valid in this sample: retry with a random annotation
+            idx = random.randint(0, len(self) - 1)
+def make_coco_transforms(image_set, max_size=640):
+    """
+    Build the transform pipeline (from ``datasets.transforms_video``) for ``image_set``.
+    ``'train'``: horizontal flip, photometric distortion, a random choice between a plain multi-scale resize and a
+    resize / crop / resize chain, then tensor conversion and normalization. ``max_size`` bounds the multi-scale resizes.
+    ``'val'``: a single resize to 360 with ``max_size`` fixed at 640 (the ``max_size`` argument is not used here),
+    then tensor conversion and normalization.
+    Raises ``ValueError`` for any other ``image_set``.
+    """
+    to_normalized_tensor = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            to_normalized_tensor,
+        ])
+    if image_set != 'train':
+        raise ValueError(f'unknown {image_set}')
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    # first alternative: multi-scale resize only
+    resize_only = T.Compose([
+        T.RandomResize(scales, max_size=max_size),
+        T.Check(),
+    ])
+    # second alternative: coarse resize, random crop, then multi-scale resize
+    resize_crop_resize = T.Compose([
+        T.RandomResize([400, 500, 600]),
+        T.RandomSizeCrop(384, 600),
+        T.RandomResize(scales, max_size=max_size),
+        T.Check(),
+    ])
+    return T.Compose([
+        T.RandomHorizontalFlip(),
+        T.PhotometricDistort(),
+        T.RandomSelect(resize_only, resize_crop_resize),
+        to_normalized_tensor,
+    ])
+def build(image_set, args):
+    """
+    Create the A2D-Sentences dataset for ``image_set`` (``'train'`` or ``'val'``) rooted at ``args.a2d_path``.
+    Reads ``args.masks``, ``args.num_frames`` and ``args.max_skip``. ``'val'`` uses the single-frame *test*
+    annotation file. The dataset is built with ``transforms=None``.
+    """
+    root = Path(args.a2d_path)
+    assert root.exists(), f'provided A2D-Sentences path {root} does not exist'
+    annotation_files = {
+        "train": root / "a2d_sentences_single_frame_train_annotations.json",
+        "val": root / "a2d_sentences_single_frame_test_annotations.json",
+    }
+    ann_file = annotation_files[image_set]
+    # NOTE: the variant passing transforms=make_coco_transforms(image_set, max_size=args.max_size)
+    # is disabled in this snapshot; frames are returned without augmentation.
+    return A2DSentencesDataset(root, ann_file, transforms=None, return_masks=args.masks,
+                               num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)

.history/datasets/ytvos_ref_20250113131134.py ADDED Viewed

	@@ -0,0 +1,241 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+from datasets.categories import ytvos_category_dict as category_dict
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    Each sample is one (video, expression) pair; ``__getitem__`` draws one random frame from each of four
+    consecutive temporal bins of the video.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        """
+        Args:
+            img_folder: split directory containing ``meta.json``, ``JPEGImages/`` and ``Annotations/``.
+            ann_file: expressions JSON file (``{'videos': {...}}``).
+            transforms: optional callable invoked as ``transforms(imgs, target)``; with ``None`` raw frames are returned.
+            return_masks: stored only, not used.
+            num_frames: stored only; the number of sampled frames is the number of bins (four).
+            max_skip: stored only, not read by this class.
+        """
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    @staticmethod
+    def _make_bins(vid_len, num_bins=4):
+        """
+        Split a video of ``vid_len`` frames into ``num_bins`` consecutive, non-empty ``(start, end)`` index ranges
+        (``end`` exclusive). The first and last two frames are excluded whenever the video is long enough for that.
+        """
+        # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+        start_idx, end_idx = 2, vid_len - 2
+        bin_size = (end_idx - start_idx) // num_bins
+        if bin_size > 0:
+            bins = []
+            for i in range(num_bins):
+                bin_start = start_idx + i * bin_size
+                # the last bin absorbs the remainder
+                bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
+                bins.append((bin_start, bin_end))
+            return bins
+        # Too short to trim: fall back to one-frame bins spread over the whole video (frames may repeat),
+        # otherwise empty bins would make random.randint fail in __getitem__.
+        picks = [min(i * vid_len // num_bins, vid_len - 1) for i in range(num_bins)]
+        return [(k, k + 1) for k in picks]
+    def prepare_metas(self):
+        """
+        Populate ``self.videos`` (video ids of the expression file) and ``self.metas`` (one dict per expression with
+        the keys ``video``, ``exp``, ``obj_id``, ``frames``, ``bins`` and ``category``).
+        """
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            if not vid_frames:
+                # nothing to sample from
+                continue
+            # bins depend only on the video length, so compute them once per video
+            bins = self._make_bins(len(vid_frames))
+            for exp_dict in vid_data['expressions'].values():
+                # keys of a JSON-decoded dict are always strings, so look the object up by its string id
+                obj_key = str(exp_dict['obj_id'])
+                self.metas.append({
+                    'video': vid,
+                    'exp': exp_dict['exp'],
+                    'obj_id': int(exp_dict['obj_id']),
+                    'frames': vid_frames,
+                    'bins': list(bins),
+                    'category': vid_meta['objects'][obj_key]['category']
+                })
+    @staticmethod
+    def bounding_box(img):
+        """
+        Return ``(rmin, rmax, cmin, cmax)``: the first/last row and column index holding a non-zero value of the 2D
+        array ``img``. ``img`` must contain at least one non-zero element, otherwise the indexing below fails.
+        """
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        """Number of (video, expression) samples."""
+        return len(self.metas)
+    def __getitem__(self, idx):
+        """
+        Return ``(imgs, target)`` for sample ``idx``: one randomly chosen frame per bin with per-frame labels, boxes,
+        masks and validity flags. If the object is absent from every sampled frame (or no box survives the
+        transforms), another sample index is drawn at random and loaded instead.
+        """
+        while True:
+            meta = self.metas[idx]  # dict
+            video, exp, obj_id, category, frames, bins = \
+                    meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins']
+            # clean up the caption
+            exp = " ".join(exp.lower().split())
+            category_id = category_dict[category]
+            # Random sample one frame from each bin, kept in temporal order
+            sample_indx = sorted(random.randint(start_idx, end_idx - 1) for start_idx, end_idx in bins)
+            # read frames and masks
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for frame_indx in sample_indx:
+                frame_name = frames[frame_indx]
+                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                # context managers close the underlying files right after decoding
+                with Image.open(img_path) as raw_img:
+                    img = raw_img.convert('RGB')
+                with Image.open(mask_path) as raw_mask:
+                    mask = np.array(raw_mask.convert('P'))
+                # create the target
+                label = torch.tensor(category_id)
+                mask = (mask==obj_id).astype(np.float32) # 0,1 binary
+                if (mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                mask = torch.from_numpy(mask)
+                # append
+                imgs.append(img)
+                labels.append(label)
+                masks.append(mask)
+                boxes.append(box)
+            # transform
+            w, h = img.size  # size of the last frame read
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            target = {
+                'frames_idx': torch.tensor(sample_indx), # [T,]
+                'labels': labels,                        # [T,]
+                'boxes': boxes,                          # [T, 4], xyxy
+                'masks': masks,                          # [T, H, W]
+                'valid': torch.tensor(valid),            # [T,]
+                'caption': exp,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)])
+            }
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at least one instance
+                return imgs, target
+            # nothing valid in this sample: retry with a random one
+            idx = random.randint(0, len(self) - 1)
+def make_coco_transforms(image_set, max_size=640):
+    """
+    Build the transform pipeline (from ``datasets.transforms_video``) for ``image_set``.
+    ``'train'``: horizontal flip, photometric distortion, a random choice between a plain multi-scale resize and a
+    resize / crop / resize chain, then tensor conversion and normalization. ``max_size`` bounds the multi-scale resizes.
+    ``'val'``: a single resize to 360 with ``max_size`` fixed at 640 (the ``max_size`` argument is not used here),
+    then tensor conversion and normalization.
+    Raises ``ValueError`` for any other ``image_set``.
+    """
+    to_normalized_tensor = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            to_normalized_tensor,
+        ])
+    if image_set != 'train':
+        raise ValueError(f'unknown {image_set}')
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    # first alternative: multi-scale resize only
+    resize_only = T.Compose([
+        T.RandomResize(scales, max_size=max_size),
+        T.Check(),
+    ])
+    # second alternative: coarse resize, random crop, then multi-scale resize
+    resize_crop_resize = T.Compose([
+        T.RandomResize([400, 500, 600]),
+        T.RandomSizeCrop(384, 600),
+        T.RandomResize(scales, max_size=max_size),
+        T.Check(),
+    ])
+    return T.Compose([
+        T.RandomHorizontalFlip(),
+        T.PhotometricDistort(),
+        T.RandomSelect(resize_only, resize_crop_resize),
+        to_normalized_tensor,
+    ])
+def build(image_set, args):
+    """
+    Create the Ref-YoutubeVOS dataset for ``image_set`` (``'train'`` or ``'val'``) rooted at ``args.ytvos_path``.
+    Reads ``args.masks``, ``args.num_frames`` and ``args.max_skip``. The dataset is built with ``transforms=None``.
+    """
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    # split name -> directory name on disk ('val' is not used actually)
+    split_dirs = {"train": "train", "val": "valid"}
+    split_dir = split_dirs[image_set]
+    img_folder = root / split_dir
+    ann_file = root / "meta_expressions" / split_dir / "meta_expressions.json"
+    # NOTE: the variant passing transforms=make_coco_transforms(image_set, max_size=args.max_size)
+    # is disabled in this snapshot; frames are returned without augmentation.
+    return YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                        num_frames=args.num_frames, max_skip=args.max_skip)

.history/datasets/ytvos_ref_20250113131327.py ADDED Viewed

	@@ -0,0 +1,241 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+from datasets.categories import ytvos_category_dict as category_dict
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    Each sample is one (video, expression) pair; ``__getitem__`` draws one random frame from each of four
+    consecutive temporal bins of the video.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        """
+        Args:
+            img_folder: split directory containing ``meta.json``, ``JPEGImages/`` and ``Annotations/``.
+            ann_file: expressions JSON file (``{'videos': {...}}``).
+            transforms: optional callable invoked as ``transforms(imgs, target)``; with ``None`` raw frames are returned.
+            return_masks: stored only, not used.
+            num_frames: stored only; the number of sampled frames is the number of bins (four).
+            max_skip: stored only, not read by this class.
+        """
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    @staticmethod
+    def _make_bins(vid_len, num_bins=4):
+        """
+        Split a video of ``vid_len`` frames into ``num_bins`` consecutive, non-empty ``(start, end)`` index ranges
+        (``end`` exclusive). The first and last two frames are excluded whenever the video is long enough for that.
+        """
+        # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+        start_idx, end_idx = 2, vid_len - 2
+        bin_size = (end_idx - start_idx) // num_bins
+        if bin_size > 0:
+            bins = []
+            for i in range(num_bins):
+                bin_start = start_idx + i * bin_size
+                # the last bin absorbs the remainder
+                bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
+                bins.append((bin_start, bin_end))
+            return bins
+        # Too short to trim: fall back to one-frame bins spread over the whole video (frames may repeat),
+        # otherwise empty bins would make random.randint fail in __getitem__.
+        picks = [min(i * vid_len // num_bins, vid_len - 1) for i in range(num_bins)]
+        return [(k, k + 1) for k in picks]
+    def prepare_metas(self):
+        """
+        Populate ``self.videos`` (video ids of the expression file) and ``self.metas`` (one dict per expression with
+        the keys ``video``, ``exp``, ``obj_id``, ``frames``, ``bins`` and ``category``).
+        """
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            if not vid_frames:
+                # nothing to sample from
+                continue
+            # bins depend only on the video length, so compute them once per video
+            bins = self._make_bins(len(vid_frames))
+            for exp_dict in vid_data['expressions'].values():
+                # objects in meta.json are keyed by the string form of the object id
+                obj_key = str(exp_dict['obj_id'])
+                self.metas.append({
+                    'video': vid,
+                    'exp': exp_dict['exp'],
+                    'obj_id': int(exp_dict['obj_id']),
+                    'frames': vid_frames,
+                    'bins': list(bins),
+                    'category': vid_meta['objects'][obj_key]['category']
+                })
+    @staticmethod
+    def bounding_box(img):
+        """
+        Return ``(rmin, rmax, cmin, cmax)``: the first/last row and column index holding a non-zero value of the 2D
+        array ``img``. ``img`` must contain at least one non-zero element, otherwise the indexing below fails.
+        """
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        """Number of (video, expression) samples."""
+        return len(self.metas)
+    def __getitem__(self, idx):
+        """
+        Return ``(imgs, target)`` for sample ``idx``: one randomly chosen frame per bin with per-frame labels, boxes,
+        masks and validity flags. If the object is absent from every sampled frame (or no box survives the
+        transforms), another sample index is drawn at random and loaded instead.
+        """
+        while True:
+            meta = self.metas[idx]  # dict
+            video, exp, obj_id, category, frames, bins = \
+                    meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins']
+            # clean up the caption
+            exp = " ".join(exp.lower().split())
+            category_id = category_dict[category]
+            # Random sample one frame from each bin, kept in temporal order
+            sample_indx = sorted(random.randint(start_idx, end_idx - 1) for start_idx, end_idx in bins)
+            # read frames and masks
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for frame_indx in sample_indx:
+                frame_name = frames[frame_indx]
+                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                # context managers close the underlying files right after decoding
+                with Image.open(img_path) as raw_img:
+                    img = raw_img.convert('RGB')
+                with Image.open(mask_path) as raw_mask:
+                    mask = np.array(raw_mask.convert('P'))
+                # create the target
+                label = torch.tensor(category_id)
+                mask = (mask==obj_id).astype(np.float32) # 0,1 binary
+                if (mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                mask = torch.from_numpy(mask)
+                # append
+                imgs.append(img)
+                labels.append(label)
+                masks.append(mask)
+                boxes.append(box)
+            # transform
+            w, h = img.size  # size of the last frame read
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            target = {
+                'frames_idx': torch.tensor(sample_indx), # [T,]
+                'labels': labels,                        # [T,]
+                'boxes': boxes,                          # [T, 4], xyxy
+                'masks': masks,                          # [T, H, W]
+                'valid': torch.tensor(valid),            # [T,]
+                'caption': exp,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)])
+            }
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at least one instance
+                return imgs, target
+            # nothing valid in this sample: retry with a random one
+            idx = random.randint(0, len(self) - 1)
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+    raise ValueError(f'unknown {image_set}')
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),    # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset

.history/datasets/ytvos_ref_20250113141118.py ADDED Viewed

	@@ -0,0 +1,241 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+from datasets.categories import ytvos_category_dict as category_dict
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+            print(vid_meta)
+            print(vid_data)
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                start_idx , end_idx = 2, vid_len-2
+                bin_size = (end_idx - start_idx) // 4
+                bins = []
+                for i in range(4):
+                    bin_start = start_idx + i * bin_size
+                    bin_end = bin_start + bin_size if i < 3 else end_idx
+                    bins.append((bin_start, bin_end))
+                meta = {
+                    'video': vid,
+                    'exp': exp_dict['exp'],
+                    'obj_id': int(exp_dict['obj_id']),
+                    'frames': vid_frames,
+                    'bins': bins,
+                    'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                }
+                self.metas.append(meta)
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        return len(self.metas)
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            meta = self.metas[idx]  # dict
+            video, exp, obj_id, category, frames, bins = \
+                    meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins']
+            # clean up the caption
+            exp = " ".join(exp.lower().split())
+            category_id = category_dict[category]
+            vid_len = len(frames)
+            # num_frames = self.num_frames
+            # Random sample one frame from each bin
+            sample_indx = []
+            for start_idx, end_idx in bins:
+                sample_indx.append(random.randint(start_idx, end_idx - 1))
+            sample_indx.sort()  # Ensure indices are in order
+            # read frames and masks
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for frame_indx in sample_indx:
+                frame_name = frames[frame_indx]
+                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                img = Image.open(img_path).convert('RGB')
+                mask = Image.open(mask_path).convert('P')
+                # create the target
+                label =  torch.tensor(category_id)
+                mask = np.array(mask)
+                mask = (mask==obj_id).astype(np.float32) # 0,1 binary
+                if (mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                mask = torch.from_numpy(mask)
+                # append
+                imgs.append(img)
+                labels.append(label)
+                masks.append(mask)
+                boxes.append(box)
+            # transform
+            w, h = img.size
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            target = {
+                'frames_idx': torch.tensor(sample_indx), # [T,]
+                'labels': labels,                        # [T,]
+                'boxes': boxes,                          # [T, 4], xyxy
+                'masks': masks,                          # [T, H, W]
+                'valid': torch.tensor(valid),            # [T,]
+                'caption': exp,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)])
+            }
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+        return imgs, target
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+    raise ValueError(f'unknown {image_set}')
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),    # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset

.history/datasets/ytvos_ref_20250113162417.py ADDED Viewed

	@@ -0,0 +1,241 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+from datasets.categories import ytvos_category_dict as category_dict
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                start_idx , end_idx = 2, vid_len-2
+                bin_size = (end_idx - start_idx) // 4
+                bins = []
+                for i in range(4):
+                    bin_start = start_idx + i * bin_size
+                    bin_end = bin_start + bin_size if i < 3 else end_idx
+                    bins.append((bin_start, bin_end))
+                # Random sample one frame from each bin
+                sample_indx = []
+                for start_idx, end_idx in bins:
+                    sample_indx.append(random.randint(start_idx, end_idx - 1))
+                sample_indx.sort()  # Ensure indices are in order
+                for frame_id in sample_indx:
+                    meta = {
+                        'video': vid,
+                        'exp': exp_dict['exp'],
+                        'obj_id': int(exp_dict['obj_id']),
+                        'frames': vid_frames,
+                        'frame_id' : frame_id,
+                        'sample_frames_id' : sample_indx,
+                        'bins': bins,
+                        'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                    }
+                    self.metas.append(meta)
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        return len(self.metas)
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            meta = self.metas[idx]  # dict
+            video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
+                    meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins']
+            # clean up the caption
+            exp = " ".join(exp.lower().split())
+            category_id = category_dict[category]
+            vid_len = len(frames)
+            # num_frames = self.num_frames
+            # read frames and masks
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for frame_indx in sample_frames_id:
+                frame_name = frames[frame_indx]
+                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                img = Image.open(img_path).convert('RGB')
+                mask = Image.open(mask_path).convert('P')
+                # create the target
+                label =  torch.tensor(category_id)
+                mask = np.array(mask)
+                mask = (mask==obj_id).astype(np.float32) # 0,1 binary
+                if (mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                mask = torch.from_numpy(mask)
+                # append
+                imgs.append(img)
+                labels.append(label)
+                masks.append(mask)
+                boxes.append(box)
+            # transform
+            w, h = img.size
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            target = {
+                'frames_idx': torch.tensor(sample_frames_id), # [T,]
+                'labels': labels,                        # [T,]
+                'boxes': boxes,                          # [T, 4], xyxy
+                'masks': masks,                          # [T, H, W]
+                'valid': torch.tensor(valid),            # [T,]
+                'caption': exp,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)])
+            }
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+        return imgs, target
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+    raise ValueError(f'unknown {image_set}')
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),    # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset

.history/datasets/ytvos_ref_20250113163313.py ADDED Viewed

	@@ -0,0 +1,248 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+from datasets.categories import ytvos_category_dict as category_dict
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        skip_vid_count = 0
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+            if vid_len < 11:
+                print(f"Too short video: {vid} with frame length {vid_len}")
+                skip_vid_count += 1
+                continue
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                start_idx , end_idx = 2, vid_len-2
+                bin_size = (end_idx - start_idx) // 4
+                bins = []
+                for i in range(4):
+                    bin_start = start_idx + i * bin_size
+                    bin_end = bin_start + bin_size if i < 3 else end_idx
+                    bins.append((bin_start, bin_end))
+                # Random sample one frame from each bin
+                sample_indx = []
+                for start_idx, end_idx in bins:
+                    sample_indx.append(random.randint(start_idx, end_idx - 1))
+                sample_indx.sort()  # Ensure indices are in order
+                for frame_id in sample_indx:
+                    meta = {
+                        'video': vid,
+                        'exp': exp_dict['exp'],
+                        'obj_id': int(exp_dict['obj_id']),
+                        'frames': vid_frames,
+                        'frame_id' : frame_id,
+                        'sample_frames_id' : sample_indx,
+                        'bins': bins,
+                        'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                    }
+                    self.metas.append(meta)
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        return len(self.metas)
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            meta = self.metas[idx]  # dict
+            video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
+                    meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins']
+            # clean up the caption
+            exp = " ".join(exp.lower().split())
+            category_id = category_dict[category]
+            vid_len = len(frames)
+            # num_frames = self.num_frames
+            # read frames and masks
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for frame_indx in sample_frames_id:
+                frame_name = frames[frame_indx]
+                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                img = Image.open(img_path).convert('RGB')
+                mask = Image.open(mask_path).convert('P')
+                # create the target
+                label =  torch.tensor(category_id)
+                mask = np.array(mask)
+                mask = (mask==obj_id).astype(np.float32) # 0,1 binary
+                if (mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                mask = torch.from_numpy(mask)
+                # append
+                imgs.append(img)
+                labels.append(label)
+                masks.append(mask)
+                boxes.append(box)
+            # transform
+            w, h = img.size
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            target = {
+                'frames_idx': torch.tensor(sample_frames_id), # [T,]
+                'labels': labels,                        # [T,]
+                'boxes': boxes,                          # [T, 4], xyxy
+                'masks': masks,                          # [T, H, W]
+                'valid': torch.tensor(valid),            # [T,]
+                'caption': exp,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)])
+            }
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+        return imgs, target
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+    raise ValueError(f'unknown {image_set}')
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),    # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset

.history/datasets/ytvos_ref_20250114201904.py ADDED Viewed

	@@ -0,0 +1,252 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+from datasets.categories import ytvos_category_dict as category_dict
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        skip_vid_count = 0
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+            if vid_len < 11:
+                #print(f"Too short video: {vid} with frame length {vid_len}")
+                skip_vid_count += 1
+                continue
+            print(f"vid_data: {vid_data}")
+            print(f"vid_meta: {vid_meta}")
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                start_idx , end_idx = 2, vid_len-2
+                bin_size = (end_idx - start_idx) // 4
+                bins = []
+                for i in range(4):
+                    bin_start = start_idx + i * bin_size
+                    bin_end = bin_start + bin_size if i < 3 else end_idx
+                    bins.append((bin_start, bin_end))
+                # Random sample one frame from each bin
+                sample_indx = []
+                for start_idx, end_idx in bins:
+                    sample_indx.append(random.randint(start_idx, end_idx - 1))
+                sample_indx.sort()  # Ensure indices are in order
+                for sample_id in sample_indx:
+                    meta = {
+                        'video': vid,
+                        'exp': exp_dict['exp'],
+                        'obj_id': int(exp_dict['obj_id']),
+                        'frames': vid_frames,
+                        'sample_id' : sample_id,
+                        'sample_frames_id' : sample_indx,
+                        'bins': bins,
+                        'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                    }
+                    self.metas.append(meta)
+        print(f"skipped {skip_vid_count} short videos")
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        return len(self.metas)
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            meta = self.metas[idx]  # dict
+            video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
+                    meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
+            # clean up the caption
+            exp = " ".join(exp.lower().split())
+            category_id = category_dict[category]
+            vid_len = len(frames)
+            # num_frames = self.num_frames
+            # read frames and masks
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for frame_indx in sample_frames_id:
+                frame_name = frames[frame_indx]
+                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                img = Image.open(img_path).convert('RGB')
+                mask = Image.open(mask_path).convert('P')
+                # create the target
+                label =  torch.tensor(category_id)
+                mask = np.array(mask)
+                mask = (mask==obj_id).astype(np.float32) # 0,1 binary
+                if (mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                mask = torch.from_numpy(mask)
+                # append
+                imgs.append(img)
+                labels.append(label)
+                masks.append(mask)
+                boxes.append(box)
+            # transform
+            w, h = img.size
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            target = {
+                'frames_idx': torch.tensor(sample_frames_id), # [T,]
+                'labels': labels,                        # [T,]
+                'boxes': boxes,                          # [T, 4], xyxy
+                'masks': masks,                          # [T, H, W]
+                'valid': torch.tensor(valid),            # [T,]
+                'caption': exp,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)])
+            }
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+        return imgs, target
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+    raise ValueError(f'unknown {image_set}')
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),    # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset

.history/datasets/ytvos_ref_20250114201908.py ADDED Viewed

	@@ -0,0 +1,253 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+from datasets.categories import ytvos_category_dict as category_dict
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        skip_vid_count = 0
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+            if vid_len < 11:
+                #print(f"Too short video: {vid} with frame length {vid_len}")
+                skip_vid_count += 1
+                continue
+            print(f"vid_data: {vid_data}")
+            print(f"vid_meta: {vid_meta}")
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                start_idx , end_idx = 2, vid_len-2
+                bin_size = (end_idx - start_idx) // 4
+                bins = []
+                for i in range(4):
+                    bin_start = start_idx + i * bin_size
+                    bin_end = bin_start + bin_size if i < 3 else end_idx
+                    bins.append((bin_start, bin_end))
+                # Random sample one frame from each bin
+                sample_indx = []
+                for start_idx, end_idx in bins:
+                    sample_indx.append(random.randint(start_idx, end_idx - 1))
+                sample_indx.sort()  # Ensure indices are in order
+                for sample_id in sample_indx:
+                    meta = {
+                        'video': vid,
+                        'exp': exp_dict['exp'],
+                        'obj_id': int(exp_dict['obj_id']),
+                        'frames': vid_frames,
+                        'sample_id' : sample_id,
+                        'sample_frames_id' : sample_indx,
+                        'bins': bins,
+                        'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                    }
+                    self.metas.append(meta)
+        print(f"skipped {skip_vid_count} short videos")
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        return len(self.metas)
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            meta = self.metas[idx]  # dict
+            video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
+                    meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
+            # clean up the caption
+            exp = " ".join(exp.lower().split())
+            category_id = category_dict[category]
+            vid_len = len(frames)
+            # num_frames = self.num_frames
+            # read frames and masks
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for frame_indx in sample_frames_id:
+                frame_name = frames[frame_indx]
+                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                img = Image.open(img_path).convert('RGB')
+                mask = Image.open(mask_path).convert('P')
+                # create the target
+                label =  torch.tensor(category_id)
+                mask = np.array(mask)
+                mask = (mask==obj_id).astype(np.float32) # 0,1 binary
+                if (mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                mask = torch.from_numpy(mask)
+                # append
+                imgs.append(img)
+                labels.append(label)
+                masks.append(mask)
+                boxes.append(box)
+            # transform
+            w, h = img.size
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            target = {
+                'frames_idx': torch.tensor(sample_frames_id), # [T,]
+                'labels': labels,                        # [T,]
+                'boxes': boxes,                          # [T, 4], xyxy
+                'masks': masks,                          # [T, H, W]
+                'valid': torch.tensor(valid),            # [T,]
+                'caption': exp,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)])
+            }
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+        return imgs, target
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+    raise ValueError(f'unknown {image_set}')
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),    # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset

.history/datasets/ytvos_ref_20250114202340.py ADDED Viewed

	@@ -0,0 +1,251 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+from datasets.categories import ytvos_category_dict as category_dict
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.vid_data, self.vid_meta = self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        skip_vid_count = 0
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+            if vid_len < 11:
+                #print(f"Too short video: {vid} with frame length {vid_len}")
+                skip_vid_count += 1
+                continue
+            return vid_meta, vid_data
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                start_idx , end_idx = 2, vid_len-2
+                bin_size = (end_idx - start_idx) // 4
+                bins = []
+                for i in range(4):
+                    bin_start = start_idx + i * bin_size
+                    bin_end = bin_start + bin_size if i < 3 else end_idx
+                    bins.append((bin_start, bin_end))
+                # Random sample one frame from each bin
+                sample_indx = []
+                for start_idx, end_idx in bins:
+                    sample_indx.append(random.randint(start_idx, end_idx - 1))
+                sample_indx.sort()  # Ensure indices are in order
+                for sample_id in sample_indx:
+                    meta = {
+                        'video': vid,
+                        'exp': exp_dict['exp'],
+                        'obj_id': int(exp_dict['obj_id']),
+                        'frames': vid_frames,
+                        'sample_id' : sample_id,
+                        'sample_frames_id' : sample_indx,
+                        'bins': bins,
+                        'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                    }
+                    self.metas.append(meta)
+        print(f"skipped {skip_vid_count} short videos")
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        return len(self.metas)
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            meta = self.metas[idx]  # dict
+            video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
+                    meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
+            # clean up the caption
+            exp = " ".join(exp.lower().split())
+            category_id = category_dict[category]
+            vid_len = len(frames)
+            # num_frames = self.num_frames
+            # read frames and masks
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for frame_indx in sample_frames_id:
+                frame_name = frames[frame_indx]
+                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                img = Image.open(img_path).convert('RGB')
+                mask = Image.open(mask_path).convert('P')
+                # create the target
+                label =  torch.tensor(category_id)
+                mask = np.array(mask)
+                mask = (mask==obj_id).astype(np.float32) # 0,1 binary
+                if (mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                mask = torch.from_numpy(mask)
+                # append
+                imgs.append(img)
+                labels.append(label)
+                masks.append(mask)
+                boxes.append(box)
+            # transform
+            w, h = img.size
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            target = {
+                'frames_idx': torch.tensor(sample_frames_id), # [T,]
+                'labels': labels,                        # [T,]
+                'boxes': boxes,                          # [T, 4], xyxy
+                'masks': masks,                          # [T, H, W]
+                'valid': torch.tensor(valid),            # [T,]
+                'caption': exp,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)])
+            }
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+        return imgs, target
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+    raise ValueError(f'unknown {image_set}')
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),    # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset

.history/datasets/ytvos_ref_20250114205314.py ADDED Viewed

	@@ -0,0 +1,250 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+from datasets.categories import ytvos_category_dict as category_dict
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        skip_vid_count = 0
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+            if vid_len < 11:
+                #print(f"Too short video: {vid} with frame length {vid_len}")
+                skip_vid_count += 1
+                continue
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                start_idx , end_idx = 2, vid_len-2
+                bin_size = (end_idx - start_idx) // 4
+                bins = []
+                for i in range(4):
+                    bin_start = start_idx + i * bin_size
+                    bin_end = bin_start + bin_size if i < 3 else end_idx
+                    bins.append((bin_start, bin_end))
+                # Random sample one frame from each bin
+                sample_indx = []
+                for start_idx, end_idx in bins:
+                    sample_indx.append(random.randint(start_idx, end_idx - 1))
+                sample_indx.sort()  # Ensure indices are in order
+                for sample_id in sample_indx:
+                    meta = {
+                        'video': vid,
+                        'exp': exp_dict['exp'],
+                        'obj_id': int(exp_dict['obj_id']),
+                        'frames': vid_frames,
+                        'sample_id' : sample_id,
+                        'sample_frames_id' : sample_indx,
+                        'bins': bins,
+                        'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                    }
+                    self.metas.append(meta)
+        print(f"skipped {skip_vid_count} short videos")
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        return len(self.metas)
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            meta = self.metas[idx]  # dict
+            video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
+                    meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
+            # clean up the caption
+            exp = " ".join(exp.lower().split())
+            category_id = category_dict[category]
+            vid_len = len(frames)
+            # num_frames = self.num_frames
+            # read frames and masks
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for frame_indx in sample_frames_id:
+                frame_name = frames[frame_indx]
+                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                img = Image.open(img_path).convert('RGB')
+                mask = Image.open(mask_path).convert('P')
+                # create the target
+                label =  torch.tensor(category_id)
+                mask = np.array(mask)
+                mask = (mask==obj_id).astype(np.float32) # 0,1 binary
+                if (mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                mask = torch.from_numpy(mask)
+                # append
+                imgs.append(img)
+                labels.append(label)
+                masks.append(mask)
+                boxes.append(box)
+            # transform
+            w, h = img.size
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            target = {
+                'frames_idx': torch.tensor(sample_frames_id), # [T,]
+                'labels': labels,                        # [T,]
+                'boxes': boxes,                          # [T, 4], xyxy
+                'masks': masks,                          # [T, H, W]
+                'valid': torch.tensor(valid),            # [T,]
+                'caption': exp,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)])
+            }
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+        return imgs, target
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+    raise ValueError(f'unknown {image_set}')
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),    # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset

.history/datasets/ytvos_ref_20250114211305.py ADDED Viewed

	@@ -0,0 +1,252 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+from datasets.categories import ytvos_category_dict as category_dict
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        skip_vid_count = 0
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+            if vid_len < 11:
+                #print(f"Too short video: {vid} with frame length {vid_len}")
+                skip_vid_count += 1
+                continue
+            # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+            start_idx , end_idx = 2, vid_len-2
+            bin_size = (end_idx - start_idx) // 4
+            bins = []
+            for i in range(4):
+                bin_start = start_idx + i * bin_size
+                bin_end = bin_start + bin_size if i < 3 else end_idx
+                bins.append((bin_start, bin_end))
+            # Random sample one frame from each bin
+            sample_indx = []
+            for start_idx, end_idx in bins:
+                sample_indx.append(random.randint(start_idx, end_idx - 1))
+            sample_indx.sort()  # Ensure indices are in order
+            meta = {
+                'video':vid,
+                'sample_indx':sample_indx,
+                'bins':bins
+            }
+            obj_id_cat = {}
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                obj_id = exp_dict['obj_id']
+                print(obj_id, type(obj_id))
+                print(vid_meta['objects'].keys())
+                if obj_id not in obj_id_cat:
+                    obj_id_cat[obj_id] = vid_meta[obj_id]['category']
+            meta['obj_id_cat'] = obj_id_cat
+            self.metas.append(meta)
+        print(f"skipped {skip_vid_count} short videos")
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        return len(self.metas)
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            meta = self.metas[idx]  # dict
+            video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
+                    meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
+            # clean up the caption
+            exp = " ".join(exp.lower().split())
+            category_id = category_dict[category]
+            vid_len = len(frames)
+            # num_frames = self.num_frames
+            # read frames and masks
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for frame_indx in sample_frames_id:
+                frame_name = frames[frame_indx]
+                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                img = Image.open(img_path).convert('RGB')
+                mask = Image.open(mask_path).convert('P')
+                # create the target
+                label =  torch.tensor(category_id)
+                mask = np.array(mask)
+                mask = (mask==obj_id).astype(np.float32) # 0,1 binary
+                if (mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                mask = torch.from_numpy(mask)
+                # append
+                imgs.append(img)
+                labels.append(label)
+                masks.append(mask)
+                boxes.append(box)
+            # transform
+            w, h = img.size
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            target = {
+                'frames_idx': torch.tensor(sample_frames_id), # [T,]
+                'labels': labels,                        # [T,]
+                'boxes': boxes,                          # [T, 4], xyxy
+                'masks': masks,                          # [T, H, W]
+                'valid': torch.tensor(valid),            # [T,]
+                'caption': exp,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)])
+            }
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1):  # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+        return imgs, target
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+    raise ValueError(f'unknown {image_set}')
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),    # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset

.history/datasets/ytvos_ref_20250116074326.py ADDED Viewed

	@@ -0,0 +1,239 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+from datasets.categories import ytvos_category_dict as category_dict
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        skip_vid_count = 0
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+            if vid_len < 11:
+                #print(f"Too short video: {vid} with frame length {vid_len}")
+                skip_vid_count += 1
+                continue
+            # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+            start_idx , end_idx = 2, vid_len-2
+            bin_size = (end_idx - start_idx) // 4
+            bins = []
+            for i in range(4):
+                bin_start = start_idx + i * bin_size
+                bin_end = bin_start + bin_size if i < 3 else end_idx
+                bins.append((bin_start, bin_end))
+            # Random sample one frame from each bin
+            sample_indx = []
+            for start_idx, end_idx in bins:
+                sample_indx.append(random.randint(start_idx, end_idx - 1))
+            sample_indx.sort()  # Ensure indices are in order
+            meta = {
+                'video':vid,
+                'sample_indx':sample_indx,
+                'bins':bins,
+                'frames':vid_frames
+            }
+            obj_id_cat = {}
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                obj_id = exp_dict['obj_id']
+                if obj_id not in obj_id_cat:
+                    obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
+            meta['obj_id_cat'] = obj_id_cat
+            self.metas.append(meta)
+        print(f"skipped {skip_vid_count} short videos")
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        return len(self.metas)
+    def __getitem__(self, idx):
+        meta = self.metas[idx]  # dict
+        video, sample_indx, bins, frames, obj_id_cat = \
+            meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
+        # read frames and masks
+        imgs, labels, boxes, masks, valid = [], [], [], [], []
+        for frame_indx in sample_indx:
+            frame_name = frames[frame_indx]
+            img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+            mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+            img = Image.open(img_path).convert('RGB')
+            imgs.append(img)
+            mask = Image.open(mask_path).convert('P')
+            mask = np.array(mask)
+            # create the target
+            for obj_id in list(obj_id_cat.keys()):
+                obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary
+                if (obj_mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(obj_mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                obj_mask = torch.from_numpy(obj_mask)
+                # append
+                masks.append(obj_mask)
+                boxes.append(box)
+        # transform
+        w, h = img.size
+        boxes = torch.stack(boxes, dim=0)
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+        masks = torch.stack(masks, dim=0)
+        target = {
+            'frames_idx': sample_indx, # [T,]
+            'boxes': boxes,                          # [T, 4], xyxy
+            'masks': masks,                          # [T, H, W]
+            'valid': torch.tensor(valid),            # [T,]
+            'obj_ids' : list(obj_id_cat.keys()),
+            'orig_size': torch.as_tensor([int(h), int(w)]),
+            'size': torch.as_tensor([int(h), int(w)])
+        }
+        # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+        if self._transforms:
+            imgs, target = self._transforms(imgs, target)
+            imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+        else:
+            imgs = np.array(imgs)
+            imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+        #  # FIXME: handle "valid", since some box may be removed due to random crop
+        # if torch.any(target['valid'] == 1):  # at leatst one instance
+        #     instance_check = True
+        # else:
+        #     idx = random.randint(0, self.__len__() - 1)
+        return imgs, target
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+    raise ValueError(f'unknown {image_set}')
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),    # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset

.history/mbench/gpt_ref-ytvos-cy_20250121151513.py ADDED Viewed

	@@ -0,0 +1,433 @@

+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+from mbench.ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+from openai import OpenAI
+import base64
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+# Captioner
+# Category names for which getCaption produces captions; getCaption treats any category
+# missing from this list as non-movable and skips it.
+ytvos_category_valid_list = [
+    'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+]
+def getCaption(video_id, json_data):
+    #데이터 가져오기
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    cat_names = set()
+    all_captions = dict()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+    # cat_names : person, snowboard
+    # 1. gpt에서 직접 action의 대상이 될 수 있는가 물어보기
+    # 2. ref-youtube-vos 에서 제공하는 카테고리 정보에서 우리가 처리하고 싶은 카테고리 이름만 남긴다
+    for cat_name in list(cat_names) :
+        image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+        image_captions = {}
+        captioner = OpenAI()
+        #0단계: action의 대상이 될 수 있는가?
+        is_movable = False
+        if cat_name in ytvos_category_valid_list :
+            is_movable = True
+        # response_check = captioner.chat.completions.create(
+        #     model="gpt-4o",
+        #     messages=[
+        #         {
+        #             "role": "user",
+        #             "content": f"""
+        #                 Can a {cat_name} be a subject of distinct actions or movements?
+        #                 For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
+        #                 However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
+        #                 Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
+        #                 Answer only YES or NONE.
+        #             """
+        #         }
+        #     ],
+        # )
+        # response_check_content = response_check.choices[0].message.content.strip().lower()
+        # print(f"Movable Check for {cat_name}: {response_check_content}")
+        # if response_check_content == "yes": is_movable = True
+        if not is_movable:
+            print(f"Skipping {cat_name}: Determined to be non-movable.")
+            continue
+        for i in range(len(image_paths)):
+            image_path = image_paths[i]
+            frame_name = frame_names[i]
+            base64_image = encode_image(image_path)
+            #1단계: 필터링
+            print(cat_name, frame_name)
+            response1 = captioner.chat.completions.create(
+                model="gpt-4o",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
+                                        Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
+                                        Each action should be unique and clearly associated with a specific object.
+                                        Respond with YES if:
+                                        - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
+                                        - The {cat_name}s involve clear, distinguishable actions performed independently.
+                                        Respond with NONE if:
+                                        - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
+                                        - Actions are ambiguous, minor, or not clearly visible.
+                                        If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
+                                        If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
+                                        Answer only YES or NONE."""
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+            response_content = response1.choices[0].message.content
+            should_caption = True if "yes" in response_content.lower() else False
+            print(f"are {cat_name}s distinguished by action: {response_content}")
+            #2단계: dense caption 만들기
+            if should_caption:
+                response2 = captioner.chat.completions.create(
+                    model="gpt-4o-mini",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": f"""
+                                            Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
+                                            1. Focus only on clear, unique, and prominent actions that distinguish each object.
+                                            2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+                                            3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+                                            4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+                                            5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+                                            6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                                            7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+                                            8. Include interactions with objects or other entities when they are prominent and observable.
+                                            9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+                                            Output only the caption.""",
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                },
+                            ],
+                        }
+                    ],
+                )
+                caption = response2.choices[0].message.content
+                print(f"{image_path} - {frame_name}: {caption}")
+            else:
+                caption = None
+            image_captions[frame_name] = caption
+        all_captions[cat_name] = image_captions
+    # final : also prepare valid object ids
+    valid_obj_ids = []
+    valid_cat_names = list(all_captions.keys())
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat = video_data['annotations'][0][obj_id]['category_name']
+        if cat in valid_cat_names : valid_obj_ids.append(obj_id)
+    return all_captions, valid_obj_ids
+# Referring expression generator and QA filter
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+    # 이미지에 해당 물체 바운딩 박스 그리기
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+    #cropped object for visibility check
+    cropped_I = I[y_min:y_max, x_min:x_max]
+    pil_cropped_I = Image.fromarray(cropped_I)
+    buff_crop = BytesIO()
+    pil_cropped_I.save(buff_crop, format='JPEG')
+    base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
+    #entire image for referring expression generation
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+    # 구분 가능 여부 확인
+    generator = OpenAI()
+    response_check = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
+                                    Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
+                                    Guidelines:
+                                    - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
+                                    - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
+                                    - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
+                                    Output only either YES or NONE.
+                        """
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    }
+                ]
+            },
+        ]
+    )
+    response_check_content = response_check.choices[0].message.content.strip().lower()
+    print(f"is object {obj_id} visible: {response_check_content}")
+    if "yes" not in response_check_content:
+        print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
+        return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
+    # Referring expression 만들기
+    # generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
+                        Guidelines for creating the referring expression:
+                        1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
+                        2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
+                        3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
+                        4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
+                        5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
+                        6. Use '{cat_name}' as the noun for the referring expressions.
+                        Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
+                        {caption}
+                        """
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                    # {
+                    #     "type": "image_url",
+                    #     "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    # }
+                ],
+            }
+        ],
+    )
+    ref_exp = response.choices[0].message.content.strip()
+    #QA filtering
+    #QA1: 원하는 물체를 설명하는지
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="gpt-4o",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                                    {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+    #QA2: 원하지 않는 물체를 설명하지 않는지
+    response2 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+                                    {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+    response2_content = response2.choices[0].message.content
+    describesNotHighlighted = True if "yes" in response2_content.lower() else False
+    isValid = True if describesHighlighted and not describesNotHighlighted else False
+    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # # 전체 데이터셋
+    # train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+    # # 전체 데이터셋 메타데이터
+    # metas = train_dataset.metas
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+    vid_ids = list(data.keys())
+    all_ref_exps = {}
+    #==================GPT 돌리기===================
+    os.environ['OPENAI_API_KEY'] = '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'
+    # 전체 데이터셋의 vid_id에 대해
+    for i in range(1):
+        vid_id = vid_ids[i]
+        #====캡션 만들기====
+        caption, valid_obj_ids = getCaption(vid_id, data)
+        cats_in_vid = list(caption.keys())
+        #====referring expression 만들고 QA filtering====
+        ref_expressions = {}
+        # 각 카테고리별로
+        for cat_name in cats_in_vid:
+            if cat_name not in ref_expressions:
+                ref_expressions[cat_name] = {}
+            # 각 비디오 프레임 별로
+            for frame_name in data[vid_id]['frame_names']:
+                if frame_name not in ref_expressions[cat_name]:
+                    ref_expressions[cat_name][frame_name] = {}  # Create frame-level dictionary
+                caption = caption[cat_name][frame_name]
+                if not caption : continue
+                else :
+                    # 각 obj id별로
+                    for obj_id in valid_obj_ids:
+                        ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
+                        ref_expressions[cat_name][frame_name][obj_id] = ref_exp  # Store ref_exp
+        all_ref_exps[vid_id] = ref_expressions
+    with open('mbench/result-cy.json', 'w') as file:
+        json.dump(all_ref_exps, file)

.history/mbench/gpt_ref-ytvos-revised_20250121160858.py ADDED Viewed

	@@ -0,0 +1,428 @@

+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+from mbench.ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+from openai import OpenAI
+import base64
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+# Captioner
+ytvos_category_valid_list = [
+    'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+]
+def getCaption(video_id, json_data):
+    #데이터 가져오기
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    cat_names = set()
+    all_captions = dict()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+    # cat_names : person, snowboard
+    # 1. gpt에서 직접 action의 대상이 될 수 있는가 물어보기
+    # 2. ref-youtube-vos 에서 제공하는 카테고리 정보에서 우리가 처리하고 싶은 카테고리 이름만 남긴다
+    for cat_name in list(cat_names) :
+        image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+        image_captions = {}
+        captioner = OpenAI()
+        #0단계: action의 대상이 될 수 있는가?
+        is_movable = False
+        if cat_name in ytvos_category_valid_list :
+            is_movable = True
+        # response_check = captioner.chat.completions.create(
+        #     model="gpt-4o",
+        #     messages=[
+        #         {
+        #             "role": "user",
+        #             "content": f"""
+        #                 Can a {cat_name} be a subject of distinct actions or movements?
+        #                 For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
+        #                 However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
+        #                 Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
+        #                 Answer only YES or NONE.
+        #             """
+        #         }
+        #     ],
+        # )
+        # response_check_content = response_check.choices[0].message.content.strip().lower()
+        # print(f"Movable Check for {cat_name}: {response_check_content}")
+        # if response_check_content == "yes": is_movable = True
+        if not is_movable:
+            print(f"Skipping {cat_name}: Determined to be non-movable.")
+            continue
+        for i in range(len(image_paths)):
+            image_path = image_paths[i]
+            frame_name = frame_names[i]
+            base64_image = encode_image(image_path)
+            #1단계: 필터링
+            #print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+            response1 = captioner.chat.completions.create(
+                model="chatgpt-4o-latest",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
+                                        Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
+                                        Each action should be unique and clearly associated with a specific object.
+                                        Respond with YES if:
+                                        - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
+                                        - The {cat_name}s involve clear, distinguishable actions performed independently.
+                                        Respond with NONE if:
+                                        - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
+                                        - Actions are ambiguous, minor, or not clearly visible.
+                                        If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
+                                        If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
+                                        Answer only YES or NONE."""
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+            response_content = response1.choices[0].message.content
+            should_caption = True if "yes" in response_content.lower() else False
+            #print(f"are {cat_name}s distinguished by action: {response_content}")
+            #2단계: dense caption 만들기
+            if should_caption:
+                response2 = captioner.chat.completions.create(
+                    model="chatgpt-4o-latest",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": f"""
+                                            Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
+                                            1. Focus only on clear, unique, and prominent actions that distinguish each object.
+                                            2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+                                            3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+                                            4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+                                            5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+                                            6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                                            7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+                                            8. Include interactions with objects or other entities when they are prominent and observable.
+                                            9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+                                            Output only the caption.""",
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                },
+                            ],
+                        }
+                    ],
+                )
+                caption = response2.choices[0].message.content
+                #print(f"{image_path} - {frame_name}: {caption}")
+            else:
+                caption = None
+            image_captions[frame_name] = caption
+        all_captions[cat_name] = image_captions
+    # final : also prepare valid object ids
+    valid_obj_ids = []
+    valid_cat_names = list(all_captions.keys())
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat = video_data['annotations'][0][obj_id]['category_name']
+        if cat in valid_cat_names : valid_obj_ids.append(obj_id)
+    return all_captions, valid_obj_ids
+# Referring expression generator and QA filter
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+    # 이미지에 해당 물체 바운딩 박스 그리기
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+    #cropped object for visibility check
+    cropped_I = I[y_min:y_max, x_min:x_max]
+    pil_cropped_I = Image.fromarray(cropped_I)
+    buff_crop = BytesIO()
+    pil_cropped_I.save(buff_crop, format='JPEG')
+    base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
+    #entire image for referring expression generation
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+    # 구분 가능 여부 확인
+    generator = OpenAI()
+    response_check = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
+                                    Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
+                                    Guidelines:
+                                    - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
+                                    - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
+                                    - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
+                                    Output only either YES or NONE.
+                        """
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    }
+                ]
+            },
+        ]
+    )
+    response_check_content = response_check.choices[0].message.content.strip().lower()
+    #print(f"is object {obj_id} visible: {response_check_content}")
+    if "yes" not in response_check_content:
+        print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
+        return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
+    # Referring expression 만들기
+    # generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
+                        Guidelines for creating the referring expression:
+                        1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
+                        2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
+                        3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
+                        4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
+                        5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
+                        6. Use '{cat_name}' as the noun for the referring expressions.
+                        Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
+                        {caption}
+                        """
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                    # {
+                    #     "type": "image_url",
+                    #     "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    # }
+                ],
+            }
+        ],
+    )
+    ref_exp = response.choices[0].message.content.strip()
+    #QA filtering
+    #QA1: 원하는 물체를 설명하는지
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                                    {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+    #QA2: 원하지 않는 물체를 설명하지 않는지
+    response2 = filter.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+                                    {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+    response2_content = response2.choices[0].message.content
+    notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
+    isValid = True if describesHighlighted and notDescribesNotHighlighted else False
+    #print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
+    #print(f"ref exp: {ref_exp}")
+    #print("")
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+    vid_ids = list(data.keys())
+    all_ref_exps = {}
+    os.environ['OPENAI_API_KEY'] = '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'
+    # 전체 데이터셋의 vid_id에 대해
+    for i in range(50):
+        vid_id = vid_ids[i]
+        #====캡션 만들기====
+        # print("=====================captioner========================")
+        captions, valid_obj_ids = getCaption(vid_id, data)
+        cats_in_vid = list(captions.keys())
+        # print()
+        #====referring expression 만들고 QA filtering====
+        # print("=====================referring expression generator & QA filter========================")
+        ref_expressions = {}
+        # 각 카테고리별로
+        for cat_name in cats_in_vid:
+            if cat_name not in ref_expressions:
+                ref_expressions[cat_name] = {}
+            # 각 비디오 프레임 별로
+            for frame_name in data[vid_id]['frame_names']:
+                # print(f'--------category: {cat_name}, frame_name: {frame_name}')
+                if frame_name not in ref_expressions[cat_name]:
+                    ref_expressions[cat_name][frame_name] = {}  # Create frame-level dictionary
+                caption = captions[cat_name][frame_name]
+                if not caption : continue
+                else :
+                    # 각 obj id별로
+                    for obj_id in valid_obj_ids:
+                        ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
+                        ref_expressions[cat_name][frame_name][obj_id] = ref_exp  # Store ref_exp
+        all_ref_exps[vid_id] = ref_expressions
+    with open('mbench/result_revised50.json', 'w') as file:
+        json.dump(all_ref_exps, file, indent=4)

.history/mbench/gpt_ref-ytvos_20250119070820.py ADDED Viewed

	@@ -0,0 +1,286 @@

+from datasets import build_dataset
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+from openai import OpenAI
+import base64
+os.environ['OPENAI_API_KEY'] = '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'
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+def getCaption(video_id, json_data):
+    #데이터 가져오기
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    cat_names = set()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+    if len(cat_names) == 1:
+        cat_name = next(iter(cat_names))
+    else:
+        print("more than 2 categories")
+        return -1
+    image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+    image_captions = {}
+    captioner = OpenAI()
+    for i in range(len(image_paths)):
+        image_path = image_paths[i]
+        frame_name = frame_names[i]
+        base64_image = encode_image(image_path)
+        #1단계: 필터링
+        response1 = captioner.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        response_content = response1.choices[0].message.content
+        should_caption = True if "yes" in response_content.lower() else False
+        #2단계: dense caption 만들기
+        if should_caption:
+            response2 = captioner.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""
+                                    Describe the image in detail focusing on the {cat_name}s' actions.
+                                    1. Each action should be prominent, clear and unique, describing the corresponding object only.
+                                    2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
+                                    3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
+                                    4. Do not include actions that needs to be guessed or suggested.""",
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+            caption = response2.choices[0].message.content
+        else:
+            caption = None
+        image_captions[frame_name] = caption
+    return image_captions
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+    # 이미지에 해당 물체 바운딩 박스 그리기
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+    #ref expression 만들기
+    generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+                                1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+                                2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+                                3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+                                4. The referring expression should only describe the highlighted {cat_name} and not any other.
+                                5. Use '{cat_name}' as the noun for the referring expressions.
+                                Output only the referring expression.
+                                {caption}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+    ref_exp = response.choices[0].message.content
+    #QA filtering
+    #QA1: 원하는 물체를 설명하는지
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                                    {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+    #QA2: 원하지 않는 물체를 설명하지 않는지
+    response2 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+                                    {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+    response2_content = response2.choices[0].message.content
+    describesNotHighlighted = True if "yes" in response2_content.lower() else False
+    isValid = True if describesHighlighted and not describesNotHighlighted else False
+    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
+def createRefExp(video_id, json_data):
+    video_data = json_data[video_id]
+    obj_ids = list(video_data['annotations'][0].keys())
+    frame_names = video_data['frame_names']
+    captions_per_frame = getCaption(video_id, json_data)
+    if captions_per_frame == -1:
+        print("There are more than 2 cateories")
+        return
+    video_ref_exps = {}
+    for frame_name in frame_names:
+        frame_caption = captions_per_frame[frame_name]
+        if frame_caption == None:
+            video_ref_exps[frame_name] = None
+        else:
+            frame_ref_exps = {}
+            for obj_id in obj_ids:
+                exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                frame_ref_exps[obj_id] = exp_per_obj
+            video_ref_exps[frame_name] = frame_ref_exps
+    return video_ref_exps
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+    videos = set()
+    with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
+        manual_select = list(file)
+    for frame in manual_select:
+        result = json.loads(frame)
+        videos.add(result['video'])
+    videos = list(videos)
+    all_video_refs = {}
+    for i in range(10):
+        video_id = videos[i]
+        video_ref = createRefExp(video_id, data)
+        all_video_refs[video_id] = video_ref

.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183936.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import os
+import sys
+from os import path as osp
+from io import BytesIO
+from mbench.ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import textwrap
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+from openai import OpenAI
+import base64
+def number_objects_and_encode(idx, color_mask=False):
+    encoded_frames = {}
+    contoured_frames = {}  # New dictionary for original images
+    vid_cat_cnts = {}
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    frame_indx = vid_meta['sample_indx']
+    cat_names = set(vid_meta['obj_id_cat'].values())
+    imgs = vid_data[0]
+    for cat in cat_names:
+        cat_frames = []
+        contour_frames = []
+        frame_cat_cnts = {}
+        for i in range(imgs.size(0)):
+            frame_name = frame_indx[i]
+            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_data = vid_data[2][frame_name]
+            obj_ids = list(frame_data.keys())
+            cat_cnt = 0
+            for j in range(len(obj_ids)):
+                obj_id = obj_ids[j]
+                obj_data = frame_data[obj_id]
+                obj_bbox = obj_data['bbox']
+                obj_valid = obj_data['valid']
+                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                obj_cat = obj_data['category_name']
+                if obj_cat == cat and obj_valid:
+                    cat_cnt += 1
+                    if color_mask == False:
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 3)
+                        for i, contour in enumerate(contours):
+                            # 윤곽선 중심 계산
+                            moments = cv2.moments(contour)
+                            if moments["m00"] != 0:  # 중심 계산 가능 여부 확인
+                                cx = int(moments["m10"] / moments["m00"])
+                                cy = int(moments["m01"] / moments["m00"])
+                            else:
+                                cx, cy = contour[0][0]  # 중심 계산 불가시 대체 좌표 사용
+                            # 텍스트 배경 (검은색 배경 만들기)
+                            font = cv2.FONT_HERSHEY_SIMPLEX
+                            text = obj_id
+                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                            text_w, text_h = text_size
+                            # 텍스트 배경 그리기 (검은색 배경)
+                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                        (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+                            # 텍스트 그리기 (흰색 텍스트)
+                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                        font, 1, (255, 255, 255), 2)
+                    else:
+                        alpha = 0.08
+                        colored_obj_mask = np.zeros_like(frame)
+                        colored_obj_mask[obj_mask == 1] = colors[j]
+                        frame[obj_mask == 1] = (
+                            (1 - alpha) * frame[obj_mask == 1]
+                            + alpha * colored_obj_mask[obj_mask == 1]
+                        )
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 2)
+                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+                        if len(contours) > 0:
+                            largest_contour = max(contours, key=cv2.contourArea)
+                            M = cv2.moments(largest_contour)
+                            if M["m00"] != 0:
+                                center_x = int(M["m10"] / M["m00"])
+                                center_y = int(M["m01"] / M["m00"])
+                            else:
+                                center_x, center_y = 0, 0
+                        font = cv2.FONT_HERSHEY_SIMPLEX
+                        text = obj_id
+                        font_scale = 0.9
+                        text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                        text_x = center_x - text_size[0] // 1  # 텍스트의 가로 중심
+                        text_y = center_y
+                        # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
+                        # 텍스트 배경 사각형 좌표 계산
+                        rect_start = (text_x - 5, text_y - text_size[1] - 5)  # 배경 사각형 좌상단
+                        # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                        rect_end = (text_x + text_size[0] + 5, text_y)
+                        cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                        cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+            # plt.figure(figsize=(12, 8))
+            # plt.imshow(frame)
+            # plt.title(f"frame {frame_name}")
+            # plt.tight_layout()
+            # plt.axis('off')
+            # plt.show()
+            buffer = BytesIO()
+            frame = Image.fromarray(frame)
+            frame.save(buffer, format='jpeg')
+            buffer.seek(0)
+            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+            frame_cat_cnts[frame_name] = cat_cnt
+            buffer.seek(0)  # Reuse buffer instead of creating a new one
+            buffer.truncate()
+            frame_for_contour = Image.fromarray(frame_for_contour)
+            frame_for_contour.save(buffer, format='jpeg')
+            buffer.seek(0)
+            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+        encoded_frames[cat] = cat_frames
+        contoured_frames[cat] = contour_frames
+        vid_cat_cnts[cat] = frame_cat_cnts
+    return encoded_frames, vid_cat_cnts, contoured_frames
+if __name__ == '__main__':
+    # Script entry point: parse the CLI arguments and define the module-level
+    # dataset, metadata and color palette that number_objects_and_encode reads.
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+    #================== Load data ===================
+    # Full dataset
+    train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+    # Metadata of the full dataset
+    metas = train_dataset.metas
+    # Eight candidate colors (RGB format)
+    colors = [
+        (255, 0, 0),    # Red
+        (0, 255, 0),    # Green
+        (0, 0, 255),    # Blue
+        (255, 255, 0),  # Yellow
+        (255, 0, 255),  # Magenta
+        (0, 255, 255),  # Cyan
+        (128, 0, 128),  # Purple
+        (255, 165, 0)   # Orange
+    ]

.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190533.py ADDED Viewed

	@@ -0,0 +1,429 @@

+import os
+import sys
+from os import path as osp
+from io import BytesIO
+from ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import textwrap
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+from openai import OpenAI
+import base64
+import json
+def number_objects_and_encode(idx, color_mask=False):
+    encoded_frames = {}
+    contoured_frames = {}  # New dictionary for original images
+    vid_cat_cnts = {}
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    frame_indx = vid_meta['sample_indx']
+    cat_names = set(vid_meta['obj_id_cat'].values())
+    imgs = vid_data[0]
+    for cat in cat_names:
+        cat_frames = []
+        contour_frames = []
+        frame_cat_cnts = {}
+        for i in range(imgs.size(0)):
+            frame_name = frame_indx[i]
+            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_data = vid_data[2][frame_name]
+            obj_ids = list(frame_data.keys())
+            cat_cnt = 0
+            for j in range(len(obj_ids)):
+                obj_id = obj_ids[j]
+                obj_data = frame_data[obj_id]
+                obj_bbox = obj_data['bbox']
+                obj_valid = obj_data['valid']
+                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                obj_cat = obj_data['category_name']
+                if obj_cat == cat and obj_valid:
+                    cat_cnt += 1
+                    if color_mask == False:
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 3)
+                        for i, contour in enumerate(contours):
+                            # 윤곽선 중심 계산
+                            moments = cv2.moments(contour)
+                            if moments["m00"] != 0:  # 중심 계산 가능 여부 확인
+                                cx = int(moments["m10"] / moments["m00"])
+                                cy = int(moments["m01"] / moments["m00"])
+                            else:
+                                cx, cy = contour[0][0]  # 중심 계산 불가시 대체 좌표 사용
+                            # 텍스트 배경 (검은색 배경 만들기)
+                            font = cv2.FONT_HERSHEY_SIMPLEX
+                            text = obj_id
+                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                            text_w, text_h = text_size
+                            # 텍스트 배경 그리기 (검은색 배경)
+                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                        (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+                            # 텍스트 그리기 (흰색 텍스트)
+                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                        font, 1, (255, 255, 255), 2)
+                    else:
+                        alpha = 0.08
+                        colored_obj_mask = np.zeros_like(frame)
+                        colored_obj_mask[obj_mask == 1] = colors[j]
+                        frame[obj_mask == 1] = (
+                            (1 - alpha) * frame[obj_mask == 1]
+                            + alpha * colored_obj_mask[obj_mask == 1]
+                        )
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 2)
+                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+                        if len(contours) > 0:
+                            largest_contour = max(contours, key=cv2.contourArea)
+                            M = cv2.moments(largest_contour)
+                            if M["m00"] != 0:
+                                center_x = int(M["m10"] / M["m00"])
+                                center_y = int(M["m01"] / M["m00"])
+                            else:
+                                center_x, center_y = 0, 0
+                        font = cv2.FONT_HERSHEY_SIMPLEX
+                        text = obj_id
+                        font_scale = 0.9
+                        text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                        text_x = center_x - text_size[0] // 1  # 텍스트의 가로 중심
+                        text_y = center_y
+                        # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
+                        # 텍스트 배경 사각형 좌표 계산
+                        rect_start = (text_x - 5, text_y - text_size[1] - 5)  # 배경 사각형 좌상단
+                        # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                        rect_end = (text_x + text_size[0] + 5, text_y)
+                        cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                        cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+            # plt.figure(figsize=(12, 8))
+            # plt.imshow(frame)
+            # plt.title(f"frame {frame_name}")
+            # plt.tight_layout()
+            # plt.axis('off')
+            # plt.show()
+            buffer = BytesIO()
+            frame = Image.fromarray(frame)
+            frame.save(buffer, format='jpeg')
+            buffer.seek(0)
+            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+            frame_cat_cnts[frame_name] = cat_cnt
+            buffer.seek(0)  # Reuse buffer instead of creating a new one
+            buffer.truncate()
+            frame_for_contour = Image.fromarray(frame_for_contour)
+            frame_for_contour.save(buffer, format='jpeg')
+            buffer.seek(0)
+            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+        encoded_frames[cat] = cat_frames
+        contoured_frames[cat] = contour_frames
+        vid_cat_cnts[cat] = frame_cat_cnts
+    return encoded_frames, vid_cat_cnts, contoured_frames
+def getCaption(idx, color_mask=True):
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    print(f"vid id: {vid_id}\n")
+    frame_indx = vid_meta['sample_indx']             # e.g. [4, 7, 9, 16]
+    cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
+    all_captions = dict()
+    base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
+    marked = "mask with boundary" if color_mask else "boundary"
+    for cat_name in list(cat_names) :
+        is_movable = False
+        if cat_name in ytvos_category_valid_list :
+            is_movable = True
+        if not is_movable:
+            print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+        image_captions = {}
+        captioner = OpenAI()
+        cat_base64_frames = base64_frames[cat_name]
+        cont_base64_frames = contoured_frames[cat_name]
+        for i in range(len(cat_base64_frames)):
+            frame_name = frame_indx[i]
+            cont_base64_image = cont_base64_frames[i]
+            base64_image = cat_base64_frames[i]
+            should_filter = False
+            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+            if frame_cat_cnts >= 2:
+                should_filter = True
+            else:
+                print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
+            if is_movable and should_filter:
+                #1단계: 필터링
+                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                caption_filter_text = f"""
+                You are a visual assistant analyzing a single frame from a video.
+                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+                Are {cat_name}s in the image performing all different and recognizable actions or postures?
+                Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
+                facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+                Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+                - Respond with "YES" if:
+                1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
+                2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
+                3) Each action is unambiguously recognizable and distinct.
+                - Respond with "NONE" if:
+                1) The actions or pose are not clearly differentiable or too similar.
+                2) They show no noticeable action beyond standing or minor movements.
+                Answer strictly with either "YES" or "NONE".
+                """
+                response1 = captioner.chat.completions.create(
+                    model="chatgpt-4o-latest",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": caption_filter_text,
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                }
+                            ],
+                        }
+                    ],
+                )
+                response_content = response1.choices[0].message.content
+                should_caption = True if "yes" in response_content.lower() else False
+                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+            else:
+                should_caption = False
+            #2단계: dense caption 만들기
+            dense_caption_prompt_1 =  f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
+                                        In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+                                        I want to use your expressions to create a action-centric referring expression dataset.
+                                        Therefore, your expressions for these {cat_name}s should describe unique action of each object.
+                                        1. Focus only on clear, unique, and prominent actions that distinguish each object.
+                                        2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+                                        3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+                                        4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+                                        5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+                                        6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                                        7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+                                        8. Include interactions with objects or other entities when they are prominent and observable.
+                                        9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+                                        10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
+                                        11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
+                                        12. Do not mention object IDs.
+                                        13. Use '{cat_name}' as the noun for the referring expressions.
+                                        Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+                                        Output referring expressions for each object id.
+                                        """
+            dense_caption_prompt = f"""
+            You are a visual assistant analyzing a single frame of a video.
+            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+            I want to use your expressions to create a action-centric referring expression dataset.
+            Please describe each {cat_name} using **clearly observable** and **specific** actions.
+            ## Guidelines:
+            1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
+            2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
+            3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
+            4. Do not use vague expressions like "interacting with something"** or "engaging with another object."
+                Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
+            5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+            6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
+            7. Base your description on the following action definitions:
+            - Facial with object manipulation
+            - General body movement, body position or pattern
+            - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
+            - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
+            ## Output Format:
+            - For each labeled {cat_name}, output one line in the format:
+            ID. action-oriented description
+            Example:
+            1. a bear grasping the edge of a wood with its front paws
+            2. the bear pushing another bear, leaning forward
+            **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
+            **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
+            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+            For each labeled {cat_name}, output referring expressions for each object id.
+            """
+            if should_caption:
+                response2 = captioner.chat.completions.create(
+                    model="chatgpt-4o-latest",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": dense_caption_prompt,
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                },
+                            ],
+                        }
+                    ],
+                )
+                caption = response2.choices[0].message.content
+                #print(f"{image_path} - {frame_name}: {caption}")
+            else:
+                caption = None
+            image_captions[frame_name] = caption
+        all_captions[cat_name] = image_captions
+    # final : also prepare valid object ids
+    valid_obj_ids = dict()
+    for cat in cat_names:
+        if cat in ytvos_category_valid_list:
+            obj_id_cat = vid_meta['obj_id_cat']
+            valid_cat_ids = []
+            for obj_id in list(obj_id_cat.keys()):
+                if obj_id_cat[obj_id] == cat:
+                    valid_cat_ids.append(obj_id)
+            valid_obj_ids[cat] = valid_cat_ids
+    return vid_id, all_captions, valid_obj_ids
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
+    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
+    args = parser.parse_args()
+    print(args.save_caption_path, flush=True)
+    print(args.save_valid_obj_ids_path, flush=True)
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    # 색상 후보 8개 (RGB 형식)
+    colors = [
+        (255, 0, 0),    # Red
+        (0, 255, 0),    # Green
+        (0, 0, 255),    # Blue
+        (255, 255, 0),  # Yellow
+        (255, 0, 255),  # Magenta
+        (0, 255, 255),  # Cyan
+        (128, 0, 128),  # Purple
+        (255, 165, 0)   # Orange
+    ]
+    ytvos_category_valid_list = [
+    'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+    ]
+    #==================gpt 돌리기===================
+    os.environ['OPENAI_API_KEY'] = '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'
+    result_captions = {}
+    result_valid_obj_ids = {}
+    for i in range(370):
+        vid_id, all_captions, valid_obj_ids = getCaption(i, True)
+        if vid_id not in result_captions:
+            result_captions[vid_id] = all_captions
+        if vid_id not in result_valid_obj_ids:
+            result_valid_obj_ids[vid_id] = valid_obj_ids
+    print("Finished!", flush=True)
+    with open(args.save_caption_path, "w") as file:
+        json.dump(result_captions, file, indent=4)
+    with open(args.save_valid_obj_ids_path, "w") as file:
+        json.dump(result_valid_obj_ids, file, indent=4)

.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190813.py ADDED Viewed

	@@ -0,0 +1,427 @@

+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from os import path as osp
+from io import BytesIO
+from mbench.ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import textwrap
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+from openai import OpenAI
+import base64
+import json
+def number_objects_and_encode(idx, color_mask=False):
+    encoded_frames = {}
+    contoured_frames = {}  # New dictionary for original images
+    vid_cat_cnts = {}
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    frame_indx = vid_meta['sample_indx']
+    cat_names = set(vid_meta['obj_id_cat'].values())
+    imgs = vid_data[0]
+    for cat in cat_names:
+        cat_frames = []
+        contour_frames = []
+        frame_cat_cnts = {}
+        for i in range(imgs.size(0)):
+            frame_name = frame_indx[i]
+            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_data = vid_data[2][frame_name]
+            obj_ids = list(frame_data.keys())
+            cat_cnt = 0
+            for j in range(len(obj_ids)):
+                obj_id = obj_ids[j]
+                obj_data = frame_data[obj_id]
+                obj_bbox = obj_data['bbox']
+                obj_valid = obj_data['valid']
+                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                obj_cat = obj_data['category_name']
+                if obj_cat == cat and obj_valid:
+                    cat_cnt += 1
+                    if color_mask == False:
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 3)
+                        for i, contour in enumerate(contours):
+                            # 윤곽선 중심 계산
+                            moments = cv2.moments(contour)
+                            if moments["m00"] != 0:  # 중심 계산 가능 여부 확인
+                                cx = int(moments["m10"] / moments["m00"])
+                                cy = int(moments["m01"] / moments["m00"])
+                            else:
+                                cx, cy = contour[0][0]  # 중심 계산 불가시 대체 좌표 사용
+                            # 텍스트 배경 (검은색 배경 만들기)
+                            font = cv2.FONT_HERSHEY_SIMPLEX
+                            text = obj_id
+                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                            text_w, text_h = text_size
+                            # 텍스트 배경 그리기 (검은색 배경)
+                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                        (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+                            # 텍스트 그리기 (흰색 텍스트)
+                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                        font, 1, (255, 255, 255), 2)
+                    else:
+                        alpha = 0.08
+                        colored_obj_mask = np.zeros_like(frame)
+                        colored_obj_mask[obj_mask == 1] = colors[j]
+                        frame[obj_mask == 1] = (
+                            (1 - alpha) * frame[obj_mask == 1]
+                            + alpha * colored_obj_mask[obj_mask == 1]
+                        )
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 2)
+                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+                        if len(contours) > 0:
+                            largest_contour = max(contours, key=cv2.contourArea)
+                            M = cv2.moments(largest_contour)
+                            if M["m00"] != 0:
+                                center_x = int(M["m10"] / M["m00"])
+                                center_y = int(M["m01"] / M["m00"])
+                            else:
+                                center_x, center_y = 0, 0
+                        font = cv2.FONT_HERSHEY_SIMPLEX
+                        text = obj_id
+                        font_scale = 0.9
+                        text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                        text_x = center_x - text_size[0] // 1  # 텍스트의 가로 중심
+                        text_y = center_y
+                        # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
+                        # 텍스트 배경 사각형 좌표 계산
+                        rect_start = (text_x - 5, text_y - text_size[1] - 5)  # 배경 사각형 좌상단
+                        # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                        rect_end = (text_x + text_size[0] + 5, text_y)
+                        cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                        cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+            # plt.figure(figsize=(12, 8))
+            # plt.imshow(frame)
+            # plt.title(f"frame {frame_name}")
+            # plt.tight_layout()
+            # plt.axis('off')
+            # plt.show()
+            buffer = BytesIO()
+            frame = Image.fromarray(frame)
+            frame.save(buffer, format='jpeg')
+            buffer.seek(0)
+            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+            frame_cat_cnts[frame_name] = cat_cnt
+            buffer.seek(0)  # Reuse buffer instead of creating a new one
+            buffer.truncate()
+            frame_for_contour = Image.fromarray(frame_for_contour)
+            frame_for_contour.save(buffer, format='jpeg')
+            buffer.seek(0)
+            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+        encoded_frames[cat] = cat_frames
+        contoured_frames[cat] = contour_frames
+        vid_cat_cnts[cat] = frame_cat_cnts
+    return encoded_frames, vid_cat_cnts, contoured_frames
+def getCaption(idx, color_mask=True):
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    print(f"vid id: {vid_id}\n")
+    frame_indx = vid_meta['sample_indx']             # e.g. [4, 7, 9, 16]
+    cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
+    all_captions = dict()
+    base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
+    marked = "mask with boundary" if color_mask else "boundary"
+    for cat_name in list(cat_names) :
+        is_movable = False
+        if cat_name in ytvos_category_valid_list :
+            is_movable = True
+        if not is_movable:
+            print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+        image_captions = {}
+        captioner = OpenAI()
+        cat_base64_frames = base64_frames[cat_name]
+        cont_base64_frames = contoured_frames[cat_name]
+        for i in range(len(cat_base64_frames)):
+            frame_name = frame_indx[i]
+            cont_base64_image = cont_base64_frames[i]
+            base64_image = cat_base64_frames[i]
+            should_filter = False
+            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+            if frame_cat_cnts >= 2:
+                should_filter = True
+            else:
+                print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
+            if is_movable and should_filter:
+                #1단계: 필터링
+                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                caption_filter_text = f"""
+                You are a visual assistant analyzing a single frame from a video.
+                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+                Are {cat_name}s in the image performing all different and recognizable actions or postures?
+                Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
+                facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+                Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+                - Respond with "YES" if:
+                1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
+                2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
+                3) Each action is unambiguously recognizable and distinct.
+                - Respond with "NONE" if:
+                1) The actions or pose are not clearly differentiable or too similar.
+                2) They show no noticeable action beyond standing or minor movements.
+                Answer strictly with either "YES" or "NONE".
+                """
+                response1 = captioner.chat.completions.create(
+                    model="chatgpt-4o-latest",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": caption_filter_text,
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                }
+                            ],
+                        }
+                    ],
+                )
+                response_content = response1.choices[0].message.content
+                should_caption = True if "yes" in response_content.lower() else False
+                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+            else:
+                should_caption = False
+            #2단계: dense caption 만들기
+            dense_caption_prompt_1 =  f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
+                                        In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+                                        I want to use your expressions to create a action-centric referring expression dataset.
+                                        Therefore, your expressions for these {cat_name}s should describe unique action of each object.
+                                        1. Focus only on clear, unique, and prominent actions that distinguish each object.
+                                        2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+                                        3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+                                        4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+                                        5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+                                        6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                                        7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+                                        8. Include interactions with objects or other entities when they are prominent and observable.
+                                        9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+                                        10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
+                                        11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
+                                        12. Do not mention object IDs.
+                                        13. Use '{cat_name}' as the noun for the referring expressions.
+                                        Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+                                        Output referring expressions for each object id.
+                                        """
+            dense_caption_prompt = f"""
+            You are a visual assistant analyzing a single frame of a video.
+            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+            I want to use your expressions to create a action-centric referring expression dataset.
+            Please describe each {cat_name} using **clearly observable** and **specific** actions.
+            ## Guidelines:
+            1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
+            2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
+            3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
+            4. Do not use vague expressions like "interacting with something"** or "engaging with another object."
+                Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
+            5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+            6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
+            7. Base your description on the following action definitions:
+            - Facial with object manipulation
+            - General body movement, body position or pattern
+            - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
+            - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
+            ## Output Format:
+            - For each labeled {cat_name}, output one line in the format:
+            ID. action-oriented description
+            Example:
+            1. a bear grasping the edge of a wood with its front paws
+            2. the bear pushing another bear, leaning forward
+            **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
+            **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
+            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+            For each labeled {cat_name}, output referring expressions for each object id.
+            """
+            if should_caption:
+                response2 = captioner.chat.completions.create(
+                    model="chatgpt-4o-latest",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": dense_caption_prompt,
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                },
+                            ],
+                        }
+                    ],
+                )
+                caption = response2.choices[0].message.content
+                #print(f"{image_path} - {frame_name}: {caption}")
+            else:
+                caption = None
+            image_captions[frame_name] = caption
+        all_captions[cat_name] = image_captions
+    # final : also prepare valid object ids
+    valid_obj_ids = dict()
+    for cat in cat_names:
+        if cat in ytvos_category_valid_list:
+            obj_id_cat = vid_meta['obj_id_cat']
+            valid_cat_ids = []
+            for obj_id in list(obj_id_cat.keys()):
+                if obj_id_cat[obj_id] == cat:
+                    valid_cat_ids.append(obj_id)
+            valid_obj_ids[cat] = valid_cat_ids
+    return vid_id, all_captions, valid_obj_ids
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
+    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    # 색상 후보 8개 (RGB 형식)
+    colors = [
+        (255, 0, 0),    # Red
+        (0, 255, 0),    # Green
+        (0, 0, 255),    # Blue
+        (255, 255, 0),  # Yellow
+        (255, 0, 255),  # Magenta
+        (0, 255, 255),  # Cyan
+        (128, 0, 128),  # Purple
+        (255, 165, 0)   # Orange
+    ]
+    ytvos_category_valid_list = [
+    'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+    ]
+    #==================gpt 돌리기===================
+    os.environ['OPENAI_API_KEY'] = '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'
+    result_captions = {}
+    result_valid_obj_ids = {}
+    for i in range(370):
+        vid_id, all_captions, valid_obj_ids = getCaption(i, True)
+        if vid_id not in result_captions:
+            result_captions[vid_id] = all_captions
+        if vid_id not in result_valid_obj_ids:
+            result_valid_obj_ids[vid_id] = valid_obj_ids
+    print("Finished!", flush=True)
+    with open(args.save_caption_path, "w") as file:
+        json.dump(result_captions, file, indent=4)
+    with open(args.save_valid_obj_ids_path, "w") as file:
+        json.dump(result_valid_obj_ids, file, indent=4)

.history/mbench/gpt_ref-ytvos_numbered_cy_20250130220417.py ADDED Viewed

	@@ -0,0 +1,427 @@

+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from os import path as osp
+from io import BytesIO
+from mbench.ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import textwrap
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+from openai import OpenAI
+import base64
+import json
+def number_objects_and_encode(idx, color_mask=False):
+    encoded_frames = {}
+    contoured_frames = {}  # New dictionary for original images
+    vid_cat_cnts = {}
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    frame_indx = vid_meta['sample_indx']
+    cat_names = set(vid_meta['obj_id_cat'].values())
+    imgs = vid_data[0]
+    for cat in cat_names:
+        cat_frames = []
+        contour_frames = []
+        frame_cat_cnts = {}
+        for i in range(imgs.size(0)):
+            frame_name = frame_indx[i]
+            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_data = vid_data[2][frame_name]
+            obj_ids = list(frame_data.keys())
+            cat_cnt = 0
+            for j in range(len(obj_ids)):
+                obj_id = obj_ids[j]
+                obj_data = frame_data[obj_id]
+                obj_bbox = obj_data['bbox']
+                obj_valid = obj_data['valid']
+                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                obj_cat = obj_data['category_name']
+                if obj_cat == cat and obj_valid:
+                    cat_cnt += 1
+                    if color_mask == False:
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 3)
+                        for i, contour in enumerate(contours):
+                            # 윤곽선 중심 계산
+                            moments = cv2.moments(contour)
+                            if moments["m00"] != 0:  # 중심 계산 가능 여부 확인
+                                cx = int(moments["m10"] / moments["m00"])
+                                cy = int(moments["m01"] / moments["m00"])
+                            else:
+                                cx, cy = contour[0][0]  # 중심 계산 불가시 대체 좌표 사용
+                            # 텍스트 배경 (검은색 배경 만들기)
+                            font = cv2.FONT_HERSHEY_SIMPLEX
+                            text = obj_id
+                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                            text_w, text_h = text_size
+                            # 텍스트 배경 그리기 (검은색 배경)
+                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                        (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+                            # 텍스트 그리기 (흰색 텍스트)
+                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                        font, 1, (255, 255, 255), 2)
+                    else:
+                        alpha = 0.08
+                        colored_obj_mask = np.zeros_like(frame)
+                        colored_obj_mask[obj_mask == 1] = colors[j]
+                        frame[obj_mask == 1] = (
+                            (1 - alpha) * frame[obj_mask == 1]
+                            + alpha * colored_obj_mask[obj_mask == 1]
+                        )
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 2)
+                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+                        if len(contours) > 0:
+                            largest_contour = max(contours, key=cv2.contourArea)
+                            M = cv2.moments(largest_contour)
+                            if M["m00"] != 0:
+                                center_x = int(M["m10"] / M["m00"])
+                                center_y = int(M["m01"] / M["m00"])
+                            else:
+                                center_x, center_y = 0, 0
+                        font = cv2.FONT_HERSHEY_SIMPLEX
+                        text = obj_id
+                        font_scale = 0.9
+                        text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                        text_x = center_x - text_size[0] // 1  # 텍스트의 가로 중심
+                        text_y = center_y
+                        # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
+                        # 텍스트 배경 사각형 좌표 계산
+                        rect_start = (text_x - 5, text_y - text_size[1] - 5)  # 배경 사각형 좌상단
+                        # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                        rect_end = (text_x + text_size[0] + 5, text_y)
+                        cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                        cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+            # plt.figure(figsize=(12, 8))
+            # plt.imshow(frame)
+            # plt.title(f"frame {frame_name}")
+            # plt.tight_layout()
+            # plt.axis('off')
+            # plt.show()
+            buffer = BytesIO()
+            frame = Image.fromarray(frame)
+            frame.save(buffer, format='jpeg')
+            buffer.seek(0)
+            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+            frame_cat_cnts[frame_name] = cat_cnt
+            buffer.seek(0)  # Reuse buffer instead of creating a new one
+            buffer.truncate()
+            frame_for_contour = Image.fromarray(frame_for_contour)
+            frame_for_contour.save(buffer, format='jpeg')
+            buffer.seek(0)
+            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+        encoded_frames[cat] = cat_frames
+        contoured_frames[cat] = contour_frames
+        vid_cat_cnts[cat] = frame_cat_cnts
+    return encoded_frames, vid_cat_cnts, contoured_frames
+def getCaption(idx, color_mask=True):
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    print(f"vid id: {vid_id}\n")
+    frame_indx = vid_meta['sample_indx']             # e.g. [4, 7, 9, 16]
+    cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
+    all_captions = dict()
+    base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
+    marked = "mask with boundary" if color_mask else "boundary"
+    for cat_name in list(cat_names) :
+        is_movable = False
+        if cat_name in ytvos_category_valid_list :
+            is_movable = True
+        if not is_movable:
+            print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+        image_captions = {}
+        captioner = OpenAI()
+        cat_base64_frames = base64_frames[cat_name]
+        cont_base64_frames = contoured_frames[cat_name]
+        for i in range(len(cat_base64_frames)):
+            frame_name = frame_indx[i]
+            cont_base64_image = cont_base64_frames[i]
+            base64_image = cat_base64_frames[i]
+            should_filter = False
+            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+            if frame_cat_cnts >= 2:
+                should_filter = True
+            else:
+                print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
+            if is_movable and should_filter:
+                #1단계: 필터링
+                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                caption_filter_text = f"""
+                You are a visual assistant analyzing a single frame from a video.
+                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+                Are {cat_name}s in the image performing all different and recognizable actions or postures?
+                Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
+                facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+                Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+                - Respond with "YES" if:
+                1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
+                2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
+                3) Each action is unambiguously recognizable and distinct.
+                - Respond with "NONE" if:
+                1) The actions or pose are not clearly differentiable or too similar.
+                2) They show no noticeable action beyond standing or minor movements.
+                Answer strictly with either "YES" or "NONE".
+                """
+                response1 = captioner.chat.completions.create(
+                    model="gpt-4o-mini",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": caption_filter_text,
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                }
+                            ],
+                        }
+                    ],
+                )
+                response_content = response1.choices[0].message.content
+                should_caption = True if "yes" in response_content.lower() else False
+                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+            else:
+                should_caption = False
+            #2단계: dense caption 만들기
+            dense_caption_prompt_1 =  f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
+                                        In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+                                        I want to use your expressions to create a action-centric referring expression dataset.
+                                        Therefore, your expressions for these {cat_name}s should describe unique action of each object.
+                                        1. Focus only on clear, unique, and prominent actions that distinguish each object.
+                                        2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+                                        3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+                                        4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+                                        5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+                                        6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                                        7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+                                        8. Include interactions with objects or other entities when they are prominent and observable.
+                                        9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+                                        10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
+                                        11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
+                                        12. Do not mention object IDs.
+                                        13. Use '{cat_name}' as the noun for the referring expressions.
+                                        Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+                                        Output referring expressions for each object id.
+                                        """
+            dense_caption_prompt = f"""
+            You are a visual assistant analyzing a single frame of a video.
+            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+            I want to use your expressions to create a action-centric referring expression dataset.
+            Please describe each {cat_name} using **clearly observable** and **specific** actions.
+            ## Guidelines:
+            1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
+            2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
+            3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
+            4. Do not use vague expressions like "interacting with something"** or "engaging with another object."
+                Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
+            5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+            6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
+            7. Base your description on the following action definitions:
+            - Facial with object manipulation
+            - General body movement, body position or pattern
+            - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
+            - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
+            ## Output Format:
+            - For each labeled {cat_name}, output one line in the format:
+            ID. action-oriented description
+            Example:
+            1. a bear grasping the edge of a wood with its front paws
+            2. the bear pushing another bear, leaning forward
+            **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
+            **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
+            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+            For each labeled {cat_name}, output referring expressions for each object id.
+            """
+            if should_caption:
+                response2 = captioner.chat.completions.create(
+                    model="gpt-4o-mini",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": dense_caption_prompt,
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                },
+                            ],
+                        }
+                    ],
+                )
+                caption = response2.choices[0].message.content
+                #print(f"{image_path} - {frame_name}: {caption}")
+            else:
+                caption = None
+            image_captions[frame_name] = caption
+        all_captions[cat_name] = image_captions
+    # final : also prepare valid object ids
+    valid_obj_ids = dict()
+    for cat in cat_names:
+        if cat in ytvos_category_valid_list:
+            obj_id_cat = vid_meta['obj_id_cat']
+            valid_cat_ids = []
+            for obj_id in list(obj_id_cat.keys()):
+                if obj_id_cat[obj_id] == cat:
+                    valid_cat_ids.append(obj_id)
+            valid_obj_ids[cat] = valid_cat_ids
+    return vid_id, all_captions, valid_obj_ids
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
+    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    # 색상 후보 8개 (RGB 형식)
+    colors = [
+        (255, 0, 0),    # Red
+        (0, 255, 0),    # Green
+        (0, 0, 255),    # Blue
+        (255, 255, 0),  # Yellow
+        (255, 0, 255),  # Magenta
+        (0, 255, 255),  # Cyan
+        (128, 0, 128),  # Purple
+        (255, 165, 0)   # Orange
+    ]
+    ytvos_category_valid_list = [
+    'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+    ]
+    #==================gpt 돌리기===================
+    os.environ['OPENAI_API_KEY'] = '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'
+    result_captions = {}
+    result_valid_obj_ids = {}
+    for i in range(370):
+        vid_id, all_captions, valid_obj_ids = getCaption(i, True)
+        if vid_id not in result_captions:
+            result_captions[vid_id] = all_captions
+        if vid_id not in result_valid_obj_ids:
+            result_valid_obj_ids[vid_id] = valid_obj_ids
+    print("Finished!", flush=True)
+    with open(args.save_caption_path, "w") as file:
+        json.dump(result_captions, file, indent=4)
+    with open(args.save_valid_obj_ids_path, "w") as file:
+        json.dump(result_valid_obj_ids, file, indent=4)

.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140559.py ADDED Viewed

	@@ -0,0 +1,461 @@

+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+import time
+from os import path as osp
+from io import BytesIO
+from mbench.ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import textwrap
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+from openai import OpenAI
+import base64
+import json
+def number_objects_and_encode(idx, color_mask=False):
+    encoded_frames = {}
+    contoured_frames = {}  # New dictionary for original images
+    vid_cat_cnts = {}
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    frame_indx = vid_meta['sample_indx']
+    cat_names = set(vid_meta['obj_id_cat'].values())
+    imgs = vid_data[0]
+    for cat in cat_names:
+        cat_frames = []
+        contour_frames = []
+        frame_cat_cnts = {}
+        for i in range(imgs.size(0)):
+            frame_name = frame_indx[i]
+            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_data = vid_data[2][frame_name]
+            obj_ids = list(frame_data.keys())
+            cat_cnt = 0
+            for j in range(len(obj_ids)):
+                obj_id = obj_ids[j]
+                obj_data = frame_data[obj_id]
+                obj_bbox = obj_data['bbox']
+                obj_valid = obj_data['valid']
+                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                obj_cat = obj_data['category_name']
+                if obj_cat == cat and obj_valid:
+                    cat_cnt += 1
+                    if color_mask == False:
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 3)
+                        for i, contour in enumerate(contours):
+                            # 윤곽선 중심 계산
+                            moments = cv2.moments(contour)
+                            if moments["m00"] != 0:  # 중심 계산 가능 여부 확인
+                                cx = int(moments["m10"] / moments["m00"])
+                                cy = int(moments["m01"] / moments["m00"])
+                            else:
+                                cx, cy = contour[0][0]  # 중심 계산 불가시 대체 좌표 사용
+                            # 텍스트 배경 (검은색 배경 만들기)
+                            font = cv2.FONT_HERSHEY_SIMPLEX
+                            text = obj_id
+                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                            text_w, text_h = text_size
+                            # 텍스트 배경 그리기 (검은색 배경)
+                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                        (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+                            # 텍스트 그리기 (흰색 텍스트)
+                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                        font, 1, (255, 255, 255), 2)
+                    else:
+                        alpha = 0.08
+                        colored_obj_mask = np.zeros_like(frame)
+                        colored_obj_mask[obj_mask == 1] = colors[j]
+                        frame[obj_mask == 1] = (
+                            (1 - alpha) * frame[obj_mask == 1]
+                            + alpha * colored_obj_mask[obj_mask == 1]
+                        )
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 2)
+                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+                        if len(contours) > 0:
+                            largest_contour = max(contours, key=cv2.contourArea)
+                            M = cv2.moments(largest_contour)
+                            if M["m00"] != 0:
+                                center_x = int(M["m10"] / M["m00"])
+                                center_y = int(M["m01"] / M["m00"])
+                            else:
+                                center_x, center_y = 0, 0
+                        font = cv2.FONT_HERSHEY_SIMPLEX
+                        text = obj_id
+                        font_scale = 0.9
+                        text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                        text_x = center_x - text_size[0] // 1  # 텍스트의 가로 중심
+                        text_y = center_y
+                        # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
+                        # 텍스트 배경 사각형 좌표 계산
+                        rect_start = (text_x - 5, text_y - text_size[1] - 5)  # 배경 사각형 좌상단
+                        # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                        rect_end = (text_x + text_size[0] + 5, text_y)
+                        cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                        cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+            # plt.figure(figsize=(12, 8))
+            # plt.imshow(frame)
+            # plt.title(f"frame {frame_name}")
+            # plt.tight_layout()
+            # plt.axis('off')
+            # plt.show()
+            buffer = BytesIO()
+            frame = Image.fromarray(frame)
+            frame.save(buffer, format='jpeg')
+            buffer.seek(0)
+            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+            frame_cat_cnts[frame_name] = cat_cnt
+            buffer.seek(0)  # Reuse buffer instead of creating a new one
+            buffer.truncate()
+            frame_for_contour = Image.fromarray(frame_for_contour)
+            frame_for_contour.save(buffer, format='jpeg')
+            buffer.seek(0)
+            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+        encoded_frames[cat] = cat_frames
+        contoured_frames[cat] = contour_frames
+        vid_cat_cnts[cat] = frame_cat_cnts
+    return encoded_frames, vid_cat_cnts, contoured_frames
+def getCaption(idx, model='gpt-4o', color_mask=True):
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    print(f"vid id: {vid_id}\n")
+    frame_indx = vid_meta['sample_indx']             # e.g. [4, 7, 9, 16]
+    cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
+    all_captions = dict()
+    base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
+    #marked = "mask with boundary" if color_mask else "boundary"
+    for cat_name in list(cat_names) :
+        is_movable = False
+        if cat_name in ytvos_category_valid_list :
+            is_movable = True
+        if not is_movable:
+            print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+        image_captions = {}
+        captioner = OpenAI()
+        cat_base64_frames = base64_frames[cat_name]
+        cont_base64_frames = contoured_frames[cat_name]
+        for i in range(len(cat_base64_frames)):
+            frame_name = frame_indx[i]
+            cont_base64_image = cont_base64_frames[i]
+            base64_image = cat_base64_frames[i]
+            should_filter = False
+            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+            if frame_cat_cnts >= 2:
+                should_filter = True
+            else:
+                print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
+            if is_movable and should_filter:
+                #1단계: 필터링
+                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                caption_filter_text = f"""
+                You are a visual assistant analyzing a single frame from a video.
+                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+                Are {cat_name}s in the image performing all different and recognizable actions or postures?
+                Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
+                facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+                Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+                - Respond with "YES" if:
+                1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
+                    (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
+                2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
+                3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
+                - Respond with "NONE" if:
+                1) The actions or pose are not clearly differentiable or too similar.
+                2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
+                3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
+                Answer strictly with either "YES" or "NONE".
+                """
+                response1 = captioner.chat.completions.create(
+                    # model="chatgpt-4o-latest",
+                    model=model,
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": caption_filter_text,
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                }
+                            ],
+                        }
+                    ],
+                )
+                response_content = response1.choices[0].message.content
+                should_caption = True if "yes" in response_content.lower() else False
+                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+            else:
+                should_caption = False
+            #2단계: dense caption 만들기
+            dense_caption_prompt_1 =  f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
+                                        In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+                                        I want to use your expressions to create a action-centric referring expression dataset.
+                                        Therefore, your expressions for these {cat_name}s should describe unique action of each object.
+                                        1. Focus only on clear, unique, and prominent actions that distinguish each object.
+                                        2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+                                        3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+                                        4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+                                        5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+                                        6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                                        7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+                                        8. Include interactions with objects or other entities when they are prominent and observable.
+                                        9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+                                        10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
+                                        11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
+                                        12. Do not mention object IDs.
+                                        13. Use '{cat_name}' as the noun for the referring expressions.
+                                        Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+                                        Output referring expressions for each object id.
+                                        """
+            dense_caption_prompt = f"""
+            You are a visual assistant analyzing a single frame of a video.
+            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+            I want to use your expressions to create an **action-centric referring expression** dataset.
+            Please describe each {cat_name} using **clearly observable** and **specific** actions.
+            ---
+            ## Guidelines:
+            1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
+            2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
+            3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
+            4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
+            5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+            6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
+            7. Base your description on these action definitions:
+            - Avoid using term 'minimal' or 'slightly'.
+            - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
+            - details such as motion and intention, facial with object manipulation
+            - movements with objects or other entities when they are prominent and observable. expression should be specific.
+                (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
+            ---
+            ## Output Format:
+            - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format :
+                object id. using {cat_name} as subject noun, action-oriented description
+                (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
+            - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
+            ### Example
+            If the frame has 2 labeled bears, your output should look like:
+            1. the bear reaching his right arm while leaning forward to capture the prey
+            2. a bear standing upright facing right, touching the bike aside
+            ---
+            **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
+            **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
+            **Do not include markdown** in the output.
+            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+            For each labeled {cat_name}, output referring expressions for each object id.
+            """
+            MAX_RETRIES = 2
+            retry_count = 0
+            if should_caption:
+                while retry_count < MAX_RETRIES:
+                    response2 = captioner.chat.completions.create(
+                        model=model,
+                        messages=[
+                            {
+                                "role": "user",
+                                "content": [
+                                    {
+                                        "type": "text",
+                                        "text": dense_caption_prompt,
+                                    },
+                                    {
+                                        "type": "image_url",
+                                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                    },
+                                ],
+                            }
+                        ],
+                    )
+                    # caption = response2.choices[0].message.content
+                    #print(f"{image_path} - {frame_name}: {caption}")
+                    caption = response2.choices[0].message.content.strip()
+                    caption_lower = caption.lower().lstrip()
+                    if caption_lower.startswith("1.") and not any(
+                        phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
+                    ):
+                        break
+                    print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
+                    retry_count += 1
+                    time.sleep(2)
+                if retry_count == MAX_RETRIES:
+                    caption = None
+                    print("Max retries reached. Caption generation failed.")
+            else:
+                caption = None
+            image_captions[frame_name] = caption
+        all_captions[cat_name] = image_captions
+    # final : also prepare valid object ids
+    valid_obj_ids = dict()
+    for cat in cat_names:
+        if cat in ytvos_category_valid_list:
+            obj_id_cat = vid_meta['obj_id_cat']
+            valid_cat_ids = []
+            for obj_id in list(obj_id_cat.keys()):
+                if obj_id_cat[obj_id] == cat:
+                    valid_cat_ids.append(obj_id)
+            valid_obj_ids[cat] = valid_cat_ids
+    return all_captions, valid_obj_ids
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
+    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    # 색상 후보 8개 (RGB 형식)
+    colors = [
+        (255, 0, 0),    # Red
+        (0, 255, 0),    # Green
+        (0, 0, 255),    # Blue
+        (255, 255, 0),  # Yellow
+        (255, 0, 255),  # Magenta
+        (0, 255, 255),  # Cyan
+        (128, 0, 128),  # Purple
+        (255, 165, 0)   # Orange
+    ]
+    ytvos_category_valid_list = [
+    'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+    ]
+    #==================gpt 돌리기===================
+    os.environ['OPENAI_API_KEY'] = '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'
+    result_captions = {}
+    result_valid_obj_ids = {}
+    for i in range(370):
+        vid_id, all_captions, valid_obj_ids = getCaption(i, True)
+        if vid_id not in result_captions:
+            result_captions[vid_id] = all_captions
+        if vid_id not in result_valid_obj_ids:
+            result_valid_obj_ids[vid_id] = valid_obj_ids
+    print("Finished!", flush=True)
+    with open(args.save_caption_path, "w") as file:
+        json.dump(result_captions, file, indent=4)
+    with open(args.save_valid_obj_ids_path, "w") as file:
+        json.dump(result_valid_obj_ids, file, indent=4)

.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141240.py ADDED Viewed

	@@ -0,0 +1,460 @@

+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+import time
+from os import path as osp
+from io import BytesIO
+from mbench.ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import textwrap
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+from openai import OpenAI
+import base64
+import json
+def number_objects_and_encode(idx, color_mask=False):
+    encoded_frames = {}
+    contoured_frames = {}  # New dictionary for original images
+    vid_cat_cnts = {}
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    frame_indx = vid_meta['sample_indx']
+    cat_names = set(vid_meta['obj_id_cat'].values())
+    imgs = vid_data[0]
+    for cat in cat_names:
+        cat_frames = []
+        contour_frames = []
+        frame_cat_cnts = {}
+        for i in range(imgs.size(0)):
+            frame_name = frame_indx[i]
+            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_data = vid_data[2][frame_name]
+            obj_ids = list(frame_data.keys())
+            cat_cnt = 0
+            for j in range(len(obj_ids)):
+                obj_id = obj_ids[j]
+                obj_data = frame_data[obj_id]
+                obj_bbox = obj_data['bbox']
+                obj_valid = obj_data['valid']
+                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                obj_cat = obj_data['category_name']
+                if obj_cat == cat and obj_valid:
+                    cat_cnt += 1
+                    if color_mask == False:
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 3)
+                        for i, contour in enumerate(contours):
+                            # 윤곽선 중심 계산
+                            moments = cv2.moments(contour)
+                            if moments["m00"] != 0:  # 중심 계산 가능 여부 확인
+                                cx = int(moments["m10"] / moments["m00"])
+                                cy = int(moments["m01"] / moments["m00"])
+                            else:
+                                cx, cy = contour[0][0]  # 중심 계산 불가시 대체 좌표 사용
+                            # 텍스트 배경 (검은색 배경 만들기)
+                            font = cv2.FONT_HERSHEY_SIMPLEX
+                            text = obj_id
+                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                            text_w, text_h = text_size
+                            # 텍스트 배경 그리기 (검은색 배경)
+                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                        (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+                            # 텍스트 그리기 (흰색 텍스트)
+                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                        font, 1, (255, 255, 255), 2)
+                    else:
+                        alpha = 0.08
+                        colored_obj_mask = np.zeros_like(frame)
+                        colored_obj_mask[obj_mask == 1] = colors[j]
+                        frame[obj_mask == 1] = (
+                            (1 - alpha) * frame[obj_mask == 1]
+                            + alpha * colored_obj_mask[obj_mask == 1]
+                        )
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 2)
+                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+                        if len(contours) > 0:
+                            largest_contour = max(contours, key=cv2.contourArea)
+                            M = cv2.moments(largest_contour)
+                            if M["m00"] != 0:
+                                center_x = int(M["m10"] / M["m00"])
+                                center_y = int(M["m01"] / M["m00"])
+                            else:
+                                center_x, center_y = 0, 0
+                        font = cv2.FONT_HERSHEY_SIMPLEX
+                        text = obj_id
+                        font_scale = 0.9
+                        text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                        text_x = center_x - text_size[0] // 1  # 텍스트의 가로 중심
+                        text_y = center_y
+                        # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
+                        # 텍스트 배경 사각형 좌표 계산
+                        rect_start = (text_x - 5, text_y - text_size[1] - 5)  # 배경 사각형 좌상단
+                        # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                        rect_end = (text_x + text_size[0] + 5, text_y)
+                        cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                        cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+            # plt.figure(figsize=(12, 8))
+            # plt.imshow(frame)
+            # plt.title(f"frame {frame_name}")
+            # plt.tight_layout()
+            # plt.axis('off')
+            # plt.show()
+            buffer = BytesIO()
+            frame = Image.fromarray(frame)
+            frame.save(buffer, format='jpeg')
+            buffer.seek(0)
+            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+            frame_cat_cnts[frame_name] = cat_cnt
+            buffer.seek(0)  # Reuse buffer instead of creating a new one
+            buffer.truncate()
+            frame_for_contour = Image.fromarray(frame_for_contour)
+            frame_for_contour.save(buffer, format='jpeg')
+            buffer.seek(0)
+            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+        encoded_frames[cat] = cat_frames
+        contoured_frames[cat] = contour_frames
+        vid_cat_cnts[cat] = frame_cat_cnts
+    return encoded_frames, vid_cat_cnts, contoured_frames
+def getCaption(idx, model='gpt-4o', color_mask=True):
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    print(f"vid id: {vid_id}\n")
+    frame_indx = vid_meta['sample_indx']             # e.g. [4, 7, 9, 16]
+    cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
+    all_captions = dict()
+    base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
+    #marked = "mask with boundary" if color_mask else "boundary"
+    for cat_name in list(cat_names) :
+        is_movable = False
+        if cat_name in ytvos_category_valid_list :
+            is_movable = True
+        if not is_movable:
+            print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+        image_captions = {}
+        captioner = OpenAI()
+        cat_base64_frames = base64_frames[cat_name]
+        cont_base64_frames = contoured_frames[cat_name]
+        for i in range(len(cat_base64_frames)):
+            frame_name = frame_indx[i]
+            cont_base64_image = cont_base64_frames[i]
+            base64_image = cat_base64_frames[i]
+            should_filter = False
+            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+            if frame_cat_cnts >= 2:
+                should_filter = True
+            else:
+                print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
+            if is_movable and should_filter:
+                #1단계: 필터링
+                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                caption_filter_text = f"""
+                You are a visual assistant analyzing a single frame from a video.
+                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+                Are {cat_name}s in the image performing all different and recognizable actions or postures?
+                Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
+                facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+                Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+                - Respond with "YES" if:
+                1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
+                    (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
+                2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
+                3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
+                - Respond with "NONE" if:
+                1) The actions or pose are not clearly differentiable or too similar.
+                2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
+                3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
+                Answer strictly with either "YES" or "NONE".
+                """
+                response1 = captioner.chat.completions.create(
+                    model=model,
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": caption_filter_text,
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                }
+                            ],
+                        }
+                    ],
+                )
+                response_content = response1.choices[0].message.content
+                should_caption = True if "yes" in response_content.lower() else False
+                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+            else:
+                should_caption = False
+            #2단계: dense caption 만들기
+            dense_caption_prompt_1 =  f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
+                                        In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+                                        I want to use your expressions to create a action-centric referring expression dataset.
+                                        Therefore, your expressions for these {cat_name}s should describe unique action of each object.
+                                        1. Focus only on clear, unique, and prominent actions that distinguish each object.
+                                        2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+                                        3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+                                        4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+                                        5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+                                        6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                                        7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+                                        8. Include interactions with objects or other entities when they are prominent and observable.
+                                        9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+                                        10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
+                                        11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
+                                        12. Do not mention object IDs.
+                                        13. Use '{cat_name}' as the noun for the referring expressions.
+                                        Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+                                        Output referring expressions for each object id.
+                                        """
+            dense_caption_prompt = f"""
+            You are a visual assistant analyzing a single frame of a video.
+            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+            I want to use your expressions to create an **action-centric referring expression** dataset.
+            Please describe each {cat_name} using **clearly observable** and **specific** actions.
+            ---
+            ## Guidelines:
+            1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
+            2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
+            3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
+            4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
+            5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+            6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
+            7. Base your description on these action definitions:
+            - Avoid using term 'minimal' or 'slightly'.
+            - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
+            - details such as motion and intention, facial with object manipulation
+            - movements with objects or other entities when they are prominent and observable. expression should be specific.
+                (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
+            ---
+            ## Output Format:
+            - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format :
+                object id. using {cat_name} as subject noun, action-oriented description
+                (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
+            - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
+            ### Example
+            If the frame has 2 labeled bears, your output should look like:
+            1. the bear reaching his right arm while leaning forward to capture the prey
+            2. a bear standing upright facing right, touching the bike aside
+            ---
+            **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
+            **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
+            **Do not include markdown** in the output.
+            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+            For each labeled {cat_name}, output referring expressions for each object id.
+            """
+            MAX_RETRIES = 2
+            retry_count = 0
+            if should_caption:
+                while retry_count < MAX_RETRIES:
+                    response2 = captioner.chat.completions.create(
+                        model=model,
+                        messages=[
+                            {
+                                "role": "user",
+                                "content": [
+                                    {
+                                        "type": "text",
+                                        "text": dense_caption_prompt,
+                                    },
+                                    {
+                                        "type": "image_url",
+                                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                    },
+                                ],
+                            }
+                        ],
+                    )
+                    # caption = response2.choices[0].message.content
+                    #print(f"{image_path} - {frame_name}: {caption}")
+                    caption = response2.choices[0].message.content.strip()
+                    caption_lower = caption.lower().lstrip()
+                    if caption_lower.startswith("1.") and not any(
+                        phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
+                    ):
+                        break
+                    print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
+                    retry_count += 1
+                    time.sleep(2)
+                if retry_count == MAX_RETRIES:
+                    caption = None
+                    print("Max retries reached. Caption generation failed.")
+            else:
+                caption = None
+            image_captions[frame_name] = caption
+        all_captions[cat_name] = image_captions
+    # final : also prepare valid object ids
+    valid_obj_ids = dict()
+    for cat in cat_names:
+        if cat in ytvos_category_valid_list:
+            obj_id_cat = vid_meta['obj_id_cat']
+            valid_cat_ids = []
+            for obj_id in list(obj_id_cat.keys()):
+                if obj_id_cat[obj_id] == cat:
+                    valid_cat_ids.append(obj_id)
+            valid_obj_ids[cat] = valid_cat_ids
+    return all_captions, valid_obj_ids
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
+    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    # 색상 후보 8개 (RGB 형식)
+    colors = [
+        (255, 0, 0),    # Red
+        (0, 255, 0),    # Green
+        (0, 0, 255),    # Blue
+        (255, 255, 0),  # Yellow
+        (255, 0, 255),  # Magenta
+        (0, 255, 255),  # Cyan
+        (128, 0, 128),  # Purple
+        (255, 165, 0)   # Orange
+    ]
+    ytvos_category_valid_list = [
+    'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+    ]
+    #==================gpt 돌리기===================
+    os.environ['OPENAI_API_KEY'] = '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'
+    result_captions = {}
+    result_valid_obj_ids = {}
+    for i in range(370):
+        vid_id, all_captions, valid_obj_ids = getCaption(i, True)
+        if vid_id not in result_captions:
+            result_captions[vid_id] = all_captions
+        if vid_id not in result_valid_obj_ids:
+            result_valid_obj_ids[vid_id] = valid_obj_ids
+    print("Finished!", flush=True)
+    with open(args.save_caption_path, "w") as file:
+        json.dump(result_captions, file, indent=4)
+    with open(args.save_valid_obj_ids_path, "w") as file:
+        json.dump(result_valid_obj_ids, file, indent=4)

.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172754.py ADDED Viewed

	@@ -0,0 +1,656 @@

+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+import time
+from os import path as osp
+from io import BytesIO
+import random
+from mbench.ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import textwrap
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+from openai import OpenAI
+import base64
+import json
+import requests
+from openai.error import APIConnectionError, OpenAIError
+def number_objects_and_encode_old(idx, color_mask=False):
+    encoded_frames = {}
+    contoured_frames = {}  # New dictionary for original images
+    vid_cat_cnts = {}
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    frame_indx = vid_meta['sample_indx']
+    cat_names = set(vid_meta['obj_id_cat'].values())
+    imgs = vid_data[0]
+    for cat in cat_names:
+        cat_frames = []
+        contour_frames = []
+        frame_cat_cnts = {}
+        for i in range(imgs.size(0)):
+            frame_name = frame_indx[i]
+            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_data = vid_data[2][frame_name]
+            obj_ids = list(frame_data.keys())
+            cat_cnt = 0
+            for j in range(len(obj_ids)):
+                obj_id = obj_ids[j]
+                obj_data = frame_data[obj_id]
+                obj_bbox = obj_data['bbox']
+                obj_valid = obj_data['valid']
+                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                obj_cat = obj_data['category_name']
+                if obj_cat == cat and obj_valid:
+                    cat_cnt += 1
+                    if color_mask == False:
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 3)
+                        for i, contour in enumerate(contours):
+                            moments = cv2.moments(contour)
+                            if moments["m00"] != 0:
+                                cx = int(moments["m10"] / moments["m00"])
+                                cy = int(moments["m01"] / moments["m00"])
+                            else:
+                                cx, cy = contour[0][0]
+                            font = cv2.FONT_HERSHEY_SIMPLEX
+                            text = obj_id
+                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                            text_w, text_h = text_size
+                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                        (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                        font, 1, (255, 255, 255), 2)
+                    else:
+                        alpha = 0.08
+                        colored_obj_mask = np.zeros_like(frame)
+                        colored_obj_mask[obj_mask == 1] = colors[j]
+                        frame[obj_mask == 1] = (
+                            (1 - alpha) * frame[obj_mask == 1]
+                            + alpha * colored_obj_mask[obj_mask == 1]
+                        )
+                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                        cv2.drawContours(frame, contours, -1, colors[j], 2)
+                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+                        if len(contours) > 0:
+                            largest_contour = max(contours, key=cv2.contourArea)
+                            M = cv2.moments(largest_contour)
+                            if M["m00"] != 0:
+                                center_x = int(M["m10"] / M["m00"])
+                                center_y = int(M["m01"] / M["m00"])
+                            else:
+                                center_x, center_y = 0, 0
+                        font = cv2.FONT_HERSHEY_SIMPLEX
+                        text = obj_id
+                        font_scale = 0.9
+                        text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                        text_x = center_x - text_size[0] // 1
+                        text_y = center_y
+                        rect_start = (text_x - 5, text_y - text_size[1] - 5)
+                        rect_end = (text_x + text_size[0] + 5, text_y)
+                        cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                        cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+            # plt.figure(figsize=(12, 8))
+            # plt.imshow(frame)
+            # plt.title(f"frame {frame_name}")
+            # plt.tight_layout()
+            # plt.axis('off')
+            # plt.show()
+            buffer = BytesIO()
+            frame = Image.fromarray(frame)
+            frame.save(buffer, format='jpeg')
+            buffer.seek(0)
+            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+            frame_cat_cnts[frame_name] = cat_cnt
+            buffer.seek(0)  # Reuse buffer instead of creating a new one
+            buffer.truncate()
+            frame_for_contour = Image.fromarray(frame_for_contour)
+            frame_for_contour.save(buffer, format='jpeg')
+            buffer.seek(0)
+            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+        encoded_frames[cat] = cat_frames
+        contoured_frames[cat] = contour_frames
+        vid_cat_cnts[cat] = frame_cat_cnts
+    return encoded_frames, contoured_frames, vid_cat_cnts
+def number_objects_and_encode(idx, color_mask=False):
+    encoded_frames = {}
+    contoured_frames = {}  # New dictionary for original images
+    vid_cat_cnts = {}
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    frame_indx = vid_meta['sample_indx']
+    cat_names = set(vid_meta['obj_id_cat'].values())
+    imgs = vid_data[0]
+    for cat in cat_names:
+        cat_frames = []
+        contour_frames = []
+        frame_cat_cnts = {}
+        for i in range(imgs.size(0)):
+            frame_name = frame_indx[i]
+            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+            frame_data = vid_data[2][frame_name]
+            obj_ids = list(frame_data.keys())
+            cat_cnt = 0
+            for j in range(len(obj_ids)):
+                obj_id = obj_ids[j]
+                obj_data = frame_data[obj_id]
+                obj_bbox = obj_data['bbox']
+                obj_valid = obj_data['valid']
+                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                obj_cat = obj_data['category_name']
+                if obj_cat == cat and obj_valid:
+                    cat_cnt += 1
+                    contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                    cv2.drawContours(frame, contours, -1, colors[j], 3)
+                    cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+                    if len(contours) > 0:
+                        largest_contour = max(contours, key=cv2.contourArea)
+                        M = cv2.moments(largest_contour)
+                        if M["m00"] != 0:
+                            center_x = int(M["m10"] / M["m00"])
+                            center_y = int(M["m01"] / M["m00"])
+                        else:
+                            center_x, center_y = 0, 0
+                    font = cv2.FONT_HERSHEY_SIMPLEX
+                    text = obj_id
+                    font_scale = 1.2
+                    text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                    text_x = center_x - text_size[0] // 1
+                    text_y = center_y
+                    rect_start = (text_x - 5, text_y - text_size[1] - 5)
+                    rect_end = (text_x + text_size[0] + 5, text_y + 3)
+                    contour_thickness = 1
+                    rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
+                    rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
+                    cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
+                    cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                    cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+                    if color_mask:
+                        alpha = 0.08
+                        colored_obj_mask = np.zeros_like(frame)
+                        colored_obj_mask[obj_mask == 1] = colors[j]
+                        frame[obj_mask == 1] = (
+                            (1 - alpha) * frame[obj_mask == 1]
+                            + alpha * colored_obj_mask[obj_mask == 1]
+                        )
+            # plt.figure(figsize=(12, 8))
+            # plt.imshow(frame)
+            # plt.title(f"frame {frame_name}")
+            # plt.tight_layout()
+            # plt.axis('off')
+            # plt.show()
+            buffer = BytesIO()
+            frame = Image.fromarray(frame)
+            frame.save(buffer, format='jpeg')
+            buffer.seek(0)
+            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+            frame_cat_cnts[frame_name] = cat_cnt
+            buffer.seek(0)  # Reuse buffer instead of creating a new one
+            buffer.truncate()
+            frame_for_contour = Image.fromarray(frame_for_contour)
+            frame_for_contour.save(buffer, format='jpeg')
+            buffer.seek(0)
+            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+        encoded_frames[cat] = cat_frames
+        contoured_frames[cat] = contour_frames
+        vid_cat_cnts[cat] = frame_cat_cnts
+    return encoded_frames, contoured_frames, vid_cat_cnts
+def getCaption(idx, model='gpt-4o'):
+    vid_meta = metas[idx]
+    vid_data = train_dataset[idx]
+    vid_id = vid_meta['video']
+    print(f"vid id: {vid_id}\n")
+    frame_indx = vid_meta['sample_indx']             # e.g. [4, 7, 9, 16]
+    cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
+    all_captions = dict()
+    # color_mask = random.choice([True, False])
+    color_mask = random.choices([False, True], weights=[60, 40])[0]
+    base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
+    #marked = "mask with boundary" if color_mask else "boundary"
+    for cat_name in list(cat_names) :
+        is_movable = False
+        if cat_name in ytvos_category_valid_list :
+            is_movable = True
+        if not is_movable:
+            print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+        image_captions = {}
+        captioner = OpenAI()
+        cat_base64_frames = base64_frames[cat_name]
+        # cont_base64_frames = contoured_frames[cat_name]
+        for i in range(len(cat_base64_frames)):
+            frame_name = frame_indx[i]
+            # cont_base64_image = cont_base64_frames[i]
+            base64_image = cat_base64_frames[i]
+            should_filter = False
+            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+            if frame_cat_cnts >= 2:
+                should_filter = True
+            else:
+                print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
+            if is_movable and should_filter:
+                #1단계: 필터링
+                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                caption_filter_text = f"""
+                You are a visual assistant analyzing a single frame from a video.
+                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+                Are {cat_name}s in the image performing all different and recognizable actions or postures?
+                Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
+                facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+                Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+                - Respond with "YES" if:
+                1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
+                    (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
+                2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
+                3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
+                - Respond with "NONE" if:
+                1) The actions or pose are not clearly differentiable or too similar.
+                2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
+                3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
+                Answer strictly with either "YES" or "NONE".
+                """
+                response1 = captioner.chat.completions.create(
+                    model=model,
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": caption_filter_text,
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                }
+                            ],
+                        }
+                    ],
+                )
+                response_content = response1.choices[0].message.content
+                should_caption = True if "yes" in response_content.lower() else False
+                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+            else:
+                should_caption = False
+            #2단계: dense caption 만들기
+            dense_caption_prompt_1 =  f"""
+            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}.
+            Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.
+            1. Focus only on clear, unique, and prominent actions that distinguish each object.
+            2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+            3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+            4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+            5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+            6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                - expressions like 'seems to be', 'appears to be' are BANNED!
+            7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+            8. Include interactions with objects or other entities when they are prominent and observable.
+            9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
+            10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
+            11. Do not mention object IDs.
+            12. Use '{cat_name}' as the noun for the referring expressions.
+            Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+            - Your answer should contain details, and follow the following format:
+                object id. action-oriented description
+                (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
+                      2. a person bending over and touching his boots to tie the shoelace.)
+            - for action-oriented description, use {cat_name} as subject noun
+            **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
+            Please pay attention to the categories of these objects and don’t change them.
+            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+            Output referring expressions for each object id. Please start your answer:"""
+            dense_caption_prompt_2 = f"""
+            You are an advanced visual language model analyzing a video frame.
+            In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
+            Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
+            Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
+            ---
+            ## Key Guidelines:
+            1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
+            - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
+            - Avoid: "moving slightly to the side" (**(X) Too vague**)
+            2. **Do not describe appearance, color, or position**—focus purely on the action.
+            - (X) "A large brown bear standing on the left"
+            - (O) "The bear is lifting its front paws and swiping forward."
+            3. **Use dynamic, action-specific verbs** rather than passive descriptions.
+            - (O) "The giraffe is tilting its head and sniffing the ground."
+            - (X) "The giraffe is near a tree and looking around."
+            4. **Avoid assumptions, emotions, or speculative phrasing.**
+            - (X) "The person seems excited" / "The person might be preparing to jump."
+            - (O) "The person is pushing its front legs against the rock and leaping forward."
+            5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                - expressions like 'seems to be', 'appears to be' are BANNED!
+            6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+            7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
+            - **Each object should have a unique, descriptive action.**
+            - (X) "Two dogs are running."
+            - (O) "1. One dog is chasing another, its legs stretched mid-air.
+                            2. The other dog is looking back while speeding up."
+            ---
+            ## Output Format:
+            - Each labeled **{cat_name}** should have exactly **one line of description**.
+            - Format: `ID. {cat_name} + action-based description`
+            - (O) Example:
+                ```
+                1. The person is leaning forward while opening a bag with both hands.
+                2. The person is holding onto a rope and pulling themselves up.
+                ```
+            - **Ensure that each object is described individually.**
+            - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
+            ---
+            ## Additional Instructions:
+            - **Do NOT** use expressions like "it appears that..." or "it seems like...".
+            - **Do NOT** mention object IDs in the description (only use the provided format).
+            - **DO NOT** include markdown formatting (no bullet points, no asterisks).
+            - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
+            Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
+            """
+            dense_caption_prompt = f"""
+            You are a visual assistant analyzing a single frame of a video.
+            In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
+            I am building an **action-centric referring expression** dataset.
+            Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
+            ---
+            ## Guidelines:
+            1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
+            2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
+            3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
+            4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
+            5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
+            6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
+            7. Base your descriptions on these principles:
+            - **Avoid words like 'minimal' or 'slightly'.**
+            - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
+            - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
+            - **Specify actions with other objects or entities** only when they are clear and observable.
+                - (O) "pushing another person"
+                - (X) "interacting with another object"
+            ---
+            ## Output Format:
+            - Each labeled **{cat_name}** must have **exactly one line**.
+            - Format: `ID. {cat_name} + action-based description`
+            - (O) Example:
+                ```
+                1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
+                2. The person is pulling a baby carriage while smiling.
+                ```
+            - **Ensure each object is described individually.**
+            - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
+            ---
+            ## Example:
+            If the frame has two labeled **bears**, your output should be:
+            ```
+            1. The bear is reaching out its right paw while leaning forward to catch prey.
+            2. A bear is standing upright, facing right, and touching the bike beside it.
+            ```
+            ---
+            ## Additional Instructions:
+            - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
+            - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
+            - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
+            - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
+            Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
+            MAX_RETRIES = 3
+            retry_count = 0
+            if should_caption:
+                while retry_count < MAX_RETRIES:
+                    selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
+                    response2 = captioner.chat.completions.create(
+                        model=model,
+                        messages=[
+                            {
+                                "role": "user",
+                                "content": [
+                                    {
+                                        "type": "text",
+                                        "text": selected_prompt,
+                                    },
+                                    {
+                                        "type": "image_url",
+                                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                    },
+                                ],
+                            }
+                        ],
+                    )
+                    # caption = response2.choices[0].message.content
+                    #print(f"{image_path} - {frame_name}: {caption}")
+                    caption = response2.choices[0].message.content.strip()
+                    caption_lower = caption.lower().lstrip()
+                    if caption_lower.startswith("1.") and not any(
+                        phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
+                    ):
+                        break
+                    print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
+                    retry_count += 1
+                    time.sleep(2)
+                if retry_count == MAX_RETRIES:
+                    caption = None
+                    print("Max retries reached. Caption generation failed.")
+            else:
+                caption = None
+            image_captions[frame_name] = caption
+        all_captions[cat_name] = image_captions
+    # final : also prepare valid object ids
+    valid_obj_ids = dict()
+    for cat in cat_names:
+        if cat in ytvos_category_valid_list:
+            obj_id_cat = vid_meta['obj_id_cat']
+            valid_cat_ids = []
+            for obj_id in list(obj_id_cat.keys()):
+                if obj_id_cat[obj_id] == cat:
+                    valid_cat_ids.append(obj_id)
+            valid_obj_ids[cat] = valid_cat_ids
+    return vid_id, all_captions, valid_obj_ids
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
+    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    # 색상 후보 8개 (RGB 형식)
+    colors = [
+        (255, 0, 0),    # Red
+        (0, 255, 0),    # Green
+        (0, 0, 255),    # Blue
+        (255, 255, 0),  # Yellow
+        (255, 0, 255),  # Magenta
+        (0, 255, 255),  # Cyan
+        (128, 0, 128),  # Purple
+        (255, 165, 0)   # Orange
+    ]
+    ytvos_category_valid_list = [
+    'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+    ]
+    #==================gpt 돌리기===================
+    os.environ['OPENAI_API_KEY'] = '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'
+    result_captions = {}
+    result_valid_obj_ids = {}
+    for i in range(len(metas)):
+        try:
+            vid_id, all_captions, valid_obj_ids = getCaption(i)
+            if vid_id not in result_captions:
+                result_captions[vid_id] = all_captions
+            if vid_id not in result_valid_obj_ids:
+                result_valid_obj_ids[vid_id] = valid_obj_ids
+        except (requests.exceptions.ConnectionError, APIConnectionError) as e:
+            print(f"created caption until {i}", flush=True)
+            with open(args.save_caption_path, "w") as file:
+                json.dump(result_captions, file, indent=4)
+            with open(args.save_valid_obj_ids_path, "w") as file:
+                json.dump(result_valid_obj_ids, file, indent=4)
+    print("Finished!", flush=True)
+    with open(args.save_caption_path, "w") as file:
+        json.dump(result_captions, file, indent=4)
+    with open(args.save_valid_obj_ids_path, "w") as file:
+        json.dump(result_valid_obj_ids, file, indent=4)

.history/mbench/make_ref-ytvos_json_20250113182322.py ADDED Viewed

	@@ -0,0 +1,100 @@

+from datasets import build_dataset
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import io
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+#==================json 만들기===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+    #초기화
+    data_idx = 0
+    while data_idx < 10:
+        #하나의 비디오에 대해
+        video_data = {}
+        video_id = metas[data_idx]['video']
+        video_data['bins'] = metas[data_idx]['bins']
+        annotation_data = []
+        frame_names = []
+        while metas[data_idx]['video'] == video_id:
+            obj_id = metas[data_idx]['obj_id']
+            sample_id = metas[data_idx]['sample_id']
+            sample_frames_id = metas[data_idx]['sample_frames_id']
+            sample_frame_idx = sample_frames_id.index(sample_id)
+            frames = metas[data_idx]['frames']
+            frame_name = frames[sample_id]
+            cat_name = metas[data_idx]['category']
+            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
+            obj_data = {obj_id: {
+                "category_name" : cat_name,
+                "bbox": bbox
+            }}
+            annotation_data.append(obj_data)
+            frame_names.append(frame_name)
+            data_idx += 1
+        video_data['annotations'] = annotation_data
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+        entire_json[video_id] = video_data
+    return entire_json
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    #==================json 만들기===================
+    entire_json_dict = createJson(train_dataset, metas)
+    entire_json = json.dumps(entire_json_dict, indent=4)
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)

.history/mbench/make_ref-ytvos_json_20250113182734.py ADDED Viewed

	@@ -0,0 +1,102 @@

+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from datasets import build_dataset
+import argparse
+import opts
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import io
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+#==================json 만들기===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+    #초기화
+    data_idx = 0
+    while data_idx < 10:
+        #하나의 비디오에 대해
+        video_data = {}
+        video_id = metas[data_idx]['video']
+        video_data['bins'] = metas[data_idx]['bins']
+        annotation_data = []
+        frame_names = []
+        while metas[data_idx]['video'] == video_id:
+            obj_id = metas[data_idx]['obj_id']
+            sample_id = metas[data_idx]['sample_id']
+            sample_frames_id = metas[data_idx]['sample_frames_id']
+            sample_frame_idx = sample_frames_id.index(sample_id)
+            frames = metas[data_idx]['frames']
+            frame_name = frames[sample_id]
+            cat_name = metas[data_idx]['category']
+            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
+            obj_data = {obj_id: {
+                "category_name" : cat_name,
+                "bbox": bbox
+            }}
+            annotation_data.append(obj_data)
+            frame_names.append(frame_name)
+            data_idx += 1
+        video_data['annotations'] = annotation_data
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+        entire_json[video_id] = video_data
+    return entire_json
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    #==================json 만들기===================
+    entire_json_dict = createJson(train_dataset, metas)
+    entire_json = json.dumps(entire_json_dict, indent=4)
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)

.history/mbench/make_ref-ytvos_json_20250113182817.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from datasets import build_dataset
+import argparse
+import opts
+from pathlib import Path
+import os
+from os import path as osp
+import io
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+#==================json 만들기===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+    #초기화
+    data_idx = 0
+    while data_idx < 10:
+        #하나의 비디오에 대해
+        video_data = {}
+        video_id = metas[data_idx]['video']
+        video_data['bins'] = metas[data_idx]['bins']
+        annotation_data = []
+        frame_names = []
+        while metas[data_idx]['video'] == video_id:
+            obj_id = metas[data_idx]['obj_id']
+            sample_id = metas[data_idx]['sample_id']
+            sample_frames_id = metas[data_idx]['sample_frames_id']
+            sample_frame_idx = sample_frames_id.index(sample_id)
+            frames = metas[data_idx]['frames']
+            frame_name = frames[sample_id]
+            cat_name = metas[data_idx]['category']
+            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
+            obj_data = {obj_id: {
+                "category_name" : cat_name,
+                "bbox": bbox
+            }}
+            annotation_data.append(obj_data)
+            frame_names.append(frame_name)
+            data_idx += 1
+        video_data['annotations'] = annotation_data
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+        entire_json[video_id] = video_data
+    return entire_json
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    #==================json 만들기===================
+    entire_json_dict = createJson(train_dataset, metas)
+    entire_json = json.dumps(entire_json_dict, indent=4)
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)

.history/mbench/make_ref-ytvos_json_20250113182842.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import sys
+from os import path as osp
+sys.path.append(os.path.abspath(osp.join(osp.dirname(__file__), '..')))
+from datasets import build_dataset
+import argparse
+import opts
+from pathlib import Path
+import io
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+#==================json 만들기===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+    #초기화
+    data_idx = 0
+    while data_idx < 10:
+        #하나의 비디오에 대해
+        video_data = {}
+        video_id = metas[data_idx]['video']
+        video_data['bins'] = metas[data_idx]['bins']
+        annotation_data = []
+        frame_names = []
+        while metas[data_idx]['video'] == video_id:
+            obj_id = metas[data_idx]['obj_id']
+            sample_id = metas[data_idx]['sample_id']
+            sample_frames_id = metas[data_idx]['sample_frames_id']
+            sample_frame_idx = sample_frames_id.index(sample_id)
+            frames = metas[data_idx]['frames']
+            frame_name = frames[sample_id]
+            cat_name = metas[data_idx]['category']
+            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
+            obj_data = {obj_id: {
+                "category_name" : cat_name,
+                "bbox": bbox
+            }}
+            annotation_data.append(obj_data)
+            frame_names.append(frame_name)
+            data_idx += 1
+        video_data['annotations'] = annotation_data
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+        entire_json[video_id] = video_data
+    return entire_json
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    #==================json 만들기===================
+    entire_json_dict = createJson(train_dataset, metas)
+    entire_json = json.dumps(entire_json_dict, indent=4)
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)

.history/mbench/make_ref-ytvos_json_20250113183130.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+from datasets import build_dataset
+import argparse
+import opts
+from pathlib import Path
+import io
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+#==================json 만들기===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+    #초기화
+    data_idx = 0
+    while data_idx < 10:
+        #하나의 비디오에 대해
+        video_data = {}
+        video_id = metas[data_idx]['video']
+        video_data['bins'] = metas[data_idx]['bins']
+        annotation_data = []
+        frame_names = []
+        while metas[data_idx]['video'] == video_id:
+            obj_id = metas[data_idx]['obj_id']
+            sample_id = metas[data_idx]['sample_id']
+            sample_frames_id = metas[data_idx]['sample_frames_id']
+            sample_frame_idx = sample_frames_id.index(sample_id)
+            frames = metas[data_idx]['frames']
+            frame_name = frames[sample_id]
+            cat_name = metas[data_idx]['category']
+            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
+            obj_data = {obj_id: {
+                "category_name" : cat_name,
+                "bbox": bbox
+            }}
+            annotation_data.append(obj_data)
+            frame_names.append(frame_name)
+            data_idx += 1
+        video_data['annotations'] = annotation_data
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+        entire_json[video_id] = video_data
+    return entire_json
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    #==================json 만들기===================
+    entire_json_dict = createJson(train_dataset, metas)
+    entire_json = json.dumps(entire_json_dict, indent=4)
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)

.history/mbench/make_ref-ytvos_json_20250116141513.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+from datasets import build_dataset
+import argparse
+import opts
+from pathlib import Path
+import io
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+#==================json 만들기===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+    #초기화
+    vid_idx = 0
+    while vid_idx < len(train_dataset):
+        #하나의 비디오에 대해
+        video_data = {}
+        video_train_frames, video_train_info = train_dataset[vid_idx]
+        video_meta = metas[vid_idx]
+        video_id = video_meta['video']
+        video_data['bins'] = video_meta['bins']
+        bin_nums = len(video_meta['bins'])
+        obj_nums = len(list(video_meta['obj_id_cat'].keys()))
+        annotation_data = []
+        frame_names = []
+        for i in range(bin_nums):
+            bin_data = {}
+            for j in range(obj_nums):
+                obj_id = str(j+1)
+                obj_data = {
+                    "category_name":video_meta['obj_id_cat'][obj_id],
+                    "bbox":video_train_info['boxes'][i*obj_nums+j, :]
+                }
+                bin_data[obj_id] = obj_data
+            annotation_data.append(bin_data)
+        video_data['annotations'] = annotation_data
+        sample_indx = metas[vid_idx]['sample_indx']
+        frames = metas[vid_idx]['frames']
+        for i in sample_indx:
+            frame_name = frames[i]
+            frame_names.append(frame_name)
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+        entire_json[video_id] = video_data
+        vid_idx += 1
+    return entire_json
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    #==================json 만들기===================
+    entire_json_dict = createJson(train_dataset, metas)
+    print(type(entire_json_dict))
+    entire_json = json.dumps(entire_json_dict, indent=4)
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)

.history/mbench/make_ref-ytvos_json_20250118024325.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import sys
+import os
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+from datasets import build_dataset
+import argparse
+import opts
+from pathlib import Path
+import io
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+from skimage import measure                        # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+#==================json 만들기===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+    #초기화
+    vid_idx = 0
+    while vid_idx < len(train_dataset):
+        #하나의 비디오에 대해
+        video_data = {}
+        video_train_frames, video_train_info = train_dataset[vid_idx]
+        video_meta = metas[vid_idx]
+        video_id = video_meta['video']
+        video_data['bins'] = video_meta['bins']
+        bin_nums = len(video_meta['bins'])
+        obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
+        annotation_data = []
+        frame_names = []
+        for i in range(bin_nums):
+            bin_data = {}
+            for j in range(obj_nums):
+                obj_id = str(j+1)
+                try:
+                    obj_data = {
+                        "category_name":video_meta['obj_id_cat'][obj_id],
+                        "bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist(),
+                        "valid":video_train_info['valid'][i*obj_nums+j].item()
+                    }
+                except:
+                    obj_data = {}
+                bin_data[obj_id] = obj_data
+            annotation_data.append(bin_data)
+        video_data['annotations'] = annotation_data
+        sample_indx = metas[vid_idx]['sample_indx']
+        frames = metas[vid_idx]['frames']
+        for i in sample_indx:
+            frame_name = frames[i]
+            frame_names.append(frame_name)
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+        entire_json[video_id] = video_data
+        vid_idx += 1
+    return entire_json
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+    #==================데이터 불러오기===================
+    # 전체 데이터셋
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+    # 전체 데이터셋 메타데이터
+    metas = train_dataset.metas
+    #==================json 만들기===================
+    entire_json_dict = createJson(train_dataset, metas)
+    print(type(entire_json_dict))
+    entire_json = json.dumps(entire_json_dict, indent=4)
+    with open('mbench/sampled_frame2.json', mode='w') as file:
+        file.write(entire_json)

.history/mbench/ytvos_ref_20250121152309.py ADDED Viewed

	@@ -0,0 +1,264 @@

+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+import torch
+from torch.utils.data import Dataset
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+# from datasets.categories import ytvos_category_dict as category_dict
+category_dict = {
+    'airplane': 0, 'ape': 1, 'bear': 2, 'bike': 3, 'bird': 4, 'boat': 5, 'bucket': 6, 'bus': 7, 'camel': 8, 'cat': 9,
+    'cow': 10, 'crocodile': 11, 'deer': 12, 'dog': 13, 'dolphin': 14, 'duck': 15, 'eagle': 16, 'earless_seal': 17,
+    'elephant': 18, 'fish': 19, 'fox': 20, 'frisbee': 21, 'frog': 22, 'giant_panda': 23, 'giraffe': 24, 'hand': 25,
+    'hat': 26, 'hedgehog': 27, 'horse': 28, 'knife': 29, 'leopard': 30, 'lion': 31, 'lizard': 32, 'monkey': 33,
+    'motorbike': 34, 'mouse': 35, 'others': 36, 'owl': 37, 'paddle': 38, 'parachute': 39, 'parrot': 40, 'penguin': 41,
+    'person': 42, 'plant': 43, 'rabbit': 44, 'raccoon': 45, 'sedan': 46, 'shark': 47, 'sheep': 48, 'sign': 49,
+    'skateboard': 50, 'snail': 51, 'snake': 52, 'snowboard': 53, 'squirrel': 54, 'surfboard': 55, 'tennis_racket': 56,
+    'tiger': 57, 'toilet': 58, 'train': 59, 'truck': 60, 'turtle': 61, 'umbrella': 62, 'whale': 63, 'zebra': 64
+}
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    In this variant each item is one video: four frame indices are drawn (one per temporal bin) when the
+    metadata is prepared, and __getitem__ returns the raw frames plus per-frame, per-object boxes and masks.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        """Store the configuration and build the per-video metadata.
+        Args:
+            img_folder: split root; 'meta.json', 'JPEGImages/' and 'Annotations/' are read from it.
+            ann_file: JSON file with the referring expressions (top-level key 'videos').
+            transforms: stored on the instance; the code that would apply it is commented out below.
+            return_masks, num_frames, max_skip: stored but not read by any method of this class.
+        """
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+    def prepare_metas(self):
+        """Populate ``self.videos`` and ``self.metas`` (one meta dict per kept video).
+        Videos with fewer than 11 frames are skipped. For each remaining video the frame range
+        [2, vid_len-2) is split into 4 bins and one frame index is drawn from each bin with the
+        global ``random`` module, so the sampled indices differ between runs unless it is seeded.
+        """
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+        self.metas = []
+        skip_vid_count = 0
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+            if vid_len < 11:
+                #print(f"Too short video: {vid} with frame length {vid_len}")
+                skip_vid_count += 1
+                continue
+            # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+            start_idx , end_idx = 2, vid_len-2
+            # vid_len >= 11 guarantees end_idx - start_idx >= 7, hence bin_size >= 1
+            bin_size = (end_idx - start_idx) // 4
+            bins = []
+            for i in range(4):
+                bin_start = start_idx + i * bin_size
+                # the last bin absorbs the remainder of the integer division
+                bin_end = bin_start + bin_size if i < 3 else end_idx
+                bins.append((bin_start, bin_end))
+            # Random sample one frame from each bin
+            # (bins are half-open [start, end); randint is inclusive, hence end - 1)
+            sample_indx = []
+            for start_idx, end_idx in bins:
+                sample_indx.append(random.randint(start_idx, end_idx - 1))
+            sample_indx.sort()  # Ensure indices are in order
+            meta = {
+                'video':vid,
+                'sample_indx':sample_indx,
+                'bins':bins,
+                'frames':vid_frames
+            }
+            # map every object id referenced by an expression to its category name
+            obj_id_cat = {}
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                obj_id = exp_dict['obj_id']
+                if obj_id not in obj_id_cat:
+                    obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
+            meta['obj_id_cat'] = obj_id_cat
+            self.metas.append(meta)
+        print(f"skipped {skip_vid_count} short videos")
+    @staticmethod
+    def bounding_box(img):
+        """Return (rmin, rmax, cmin, cmax) of the non-zero region of a 2-D array.
+        Indices are inclusive. The array must contain at least one non-zero element;
+        the caller in __getitem__ checks this before calling.
+        """
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+    def __len__(self):
+        """Number of videos kept by prepare_metas."""
+        return len(self.metas)
+    def __getitem__(self, idx):
+        """Load the sampled frames of video ``idx`` with per-object boxes and masks.
+        With T sampled frames and N objects (keys of ``obj_id_cat``), boxes/masks/valid are
+        flattened frame-major, object-minor: row ``t*N + n`` belongs to frame t and the n-th key.
+        Returns:
+            imgs: tensor of the frames transposed to (T, 3, H, W).
+            target: dict with 'frames_idx', 'boxes', 'masks', 'valid', 'obj_ids', 'orig_size', 'size'.
+            annos: ``{frame_index: {obj_id: {'category_name', 'bbox', 'valid', 'mask'}}}``.
+        """
+        meta = self.metas[idx]  # dict
+        video, sample_indx, bins, frames, obj_id_cat = \
+            meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
+        # read frames and masks
+        annos = {}
+        imgs, labels, boxes, masks, valid = [], [], [], [], []
+        for frame_indx in sample_indx:
+            frame_name = frames[frame_indx]
+            img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+            mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+            img = Image.open(img_path).convert('RGB')
+            imgs.append(img)
+            # palette image: each pixel value is compared against the integer object id below
+            mask = Image.open(mask_path).convert('P')
+            mask = np.array(mask)
+            frame_annotations = {}
+            # create the target
+            for obj_id in list(obj_id_cat.keys()):
+                obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary
+                if (obj_mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(obj_mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                    val = 1
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                    val = 0
+                obj_mask = torch.from_numpy(obj_mask)
+                # append
+                masks.append(obj_mask)
+                boxes.append(box)
+                frame_annotations[obj_id] = {
+                    'category_name': obj_id_cat[obj_id],
+                    'bbox': box,
+                    'valid' : val,
+                    'mask': obj_mask
+                }
+            annos[frame_indx] = frame_annotations
+        # transform
+        # size is taken from the last frame read in the loop above
+        w, h = img.size
+        boxes = torch.stack(boxes, dim=0)
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+        masks = torch.stack(masks, dim=0)
+        target = {
+            'frames_idx': sample_indx, # [T,]
+            'boxes': boxes,                          # [T*N, 4], xyxy in pixels
+            'masks': masks,                          # [T*N, H, W]
+            'valid': torch.tensor(valid),            # [T*N,]
+            'obj_ids' : list(obj_id_cat.keys()),
+            'orig_size': torch.as_tensor([int(h), int(w)]),
+            'size': torch.as_tensor([int(h), int(w)])
+        }
+        # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+        # (disabled here, so boxes stay in pixel xyxy and images stay un-normalized)
+        # if self._transforms:
+        #     imgs, target = self._transforms(imgs, target)
+        #     imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+        # else:
+        # presumably all frames of a video share one size so they stack into (T, H, W, 3) - TODO confirm
+        imgs = np.array(imgs)
+        imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+        #  # FIXME: handle "valid", since some box may be removed due to random crop
+        # if torch.any(target['valid'] == 1):  # at leatst one instance
+        #     instance_check = True
+        # else:
+        #     idx = random.randint(0, self.__len__() - 1)
+        return imgs, target, annos
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+    raise ValueError(f'unknown {image_set}')
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),    # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset

.history/mbench_a2d/gpt_a2d_numbered_20250205111640.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from datasets import build_dataset
+import argparse
+import opts
+import sys
+import os
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+import cv2
+from io import BytesIO
+import base64
+from PIL import Image
+from openai import OpenAI
+def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
+    #마스크 색칠할지
+    if color_mask == True:
+        alpha = 0.1
+        colored_mask = np.zeros_like(frame)
+        colored_mask[mask == 1] = [255, 0, 0]
+        frame[mask == 1] = (
+            (1 - alpha) * frame[mask == 1] +
+            alpha * colored_mask[mask == 1]
+        )
+    #마스크 아웃라인 그리기
+    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
+    #instance_id 적을지
+    if label_number == True:
+        if len(contours) > 0:
+            largest_contour = max(contours, key=cv2.contourArea)
+            M = cv2.moments(largest_contour)
+            if M["m00"] != 0:
+                center_x = int(M["m10"] / M["m00"])
+                center_y = int(M["m01"] / M["m00"])
+            else:
+                center_x, center_y = 0, 0
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            text = str(instance_id)
+            font_scale = 0.6
+            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+            text_x = center_x - text_size[0] // 1  # 텍스트의 가로 중심
+            text_y = center_y
+            # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
+            # 텍스트 배경 사각형 좌표 계산
+            rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단
+            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+            rect_end = (text_x + text_size[0] + 5, text_y)
+            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
+    # plt.figure(figsize=(6, 10))
+    # plt.imshow(frame)
+    # plt.title(text_query)
+    # plt.tight_layout()
+    # plt.axis('off')
+    # plt.show()
+    buffer = BytesIO()
+    frame = Image.fromarray(frame)
+    frame.save(buffer, format='jpeg')
+    buffer.seek(0)
+    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
+    return encoded_frame
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+    train_dataset = build_dataset('a2d', image_set = 'train', args = args)
+    text_annotations = train_dataset.text_annotations

.history/mbench_a2d/gpt_a2d_numbered_20250205122340.py ADDED Viewed

	@@ -0,0 +1,196 @@

+from datasets import build_dataset
+import argparse
+import opts
+import sys
+import os
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+import cv2
+from io import BytesIO
+import base64
+from PIL import Image
+import json
+from openai import OpenAI
+def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
+    #마스크 색칠할지
+    if color_mask == True:
+        alpha = 0.1
+        colored_mask = np.zeros_like(frame)
+        colored_mask[mask == 1] = [255, 0, 0]
+        frame[mask == 1] = (
+            (1 - alpha) * frame[mask == 1] +
+            alpha * colored_mask[mask == 1]
+        )
+    #마스크 아웃라인 그리기
+    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
+    #instance_id 적을지
+    if label_number == True:
+        if len(contours) > 0:
+            largest_contour = max(contours, key=cv2.contourArea)
+            M = cv2.moments(largest_contour)
+            if M["m00"] != 0:
+                center_x = int(M["m10"] / M["m00"])
+                center_y = int(M["m01"] / M["m00"])
+            else:
+                center_x, center_y = 0, 0
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            text = str(instance_id)
+            font_scale = 0.6
+            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+            text_x = center_x - text_size[0] // 1  # 텍스트의 가로 중심
+            text_y = center_y
+            # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
+            # 텍스트 배경 사각형 좌표 계산
+            rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단
+            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+            rect_end = (text_x + text_size[0] + 5, text_y)
+            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
+    # plt.figure(figsize=(6, 10))
+    # plt.imshow(frame)
+    # plt.title(text_query)
+    # plt.tight_layout()
+    # plt.axis('off')
+    # plt.show()
+    buffer = BytesIO()
+    frame = Image.fromarray(frame)
+    frame.save(buffer, format='jpeg')
+    buffer.seek(0)
+    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
+    return encoded_frame
+def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
+    base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
+    captioner = OpenAI()
+    #필터링하지 않고 바로 ref exp 만들기
+    dense_caption_prompt = f"""
+    You are a visual assistant analyzing a single frame of a video.
+    In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
+    I also give you a text query describing the marked object.
+    I want to use your expression to create an **action-centric referring expression** dataset.
+    Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
+    ---
+    ## Guidelines:
+    1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
+    2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
+    3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
+    4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
+    5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
+    7. Base your description on these action definitions:
+    - Avoid using term 'minimal' or 'slightly'.
+    - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
+    - details such as motion and intention, facial with object manipulation
+    - movements with object or other entities when they are prominent and observable. expression should be specific.
+        (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
+    --
+    ## Output Format:
+    - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
+        object id. action-oriented description
+        (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
+    ### Example
+    If the frame has 1 labeled bear, your output should look like:
+    1. the bear reaching his right arm while leaning forward to capture the prey
+    ---
+    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
+    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
+    **Do not include markdown** in the output.
+    Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+    For each labeled object, output referring expressions for each object id.
+    """
+    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
+    MAX_RETRIES = 2
+    retry_count = 0
+    while retry_count < MAX_RETRIES:
+        response = captioner.chat.completions.create(
+            model=model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt_with_text_query,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        caption = response.choices[0].message.content.strip()
+        caption_lower = caption.lower().lstrip()
+        if caption_lower.startswith("1.") and not any(
+            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
+        ):
+            break
+        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
+        retry_count += 1
+        time.sleep(2)
+        if retry_count == MAX_RETRIES:
+            caption = None
+            print("Max retries reached. Caption generation failed.")
+    else:
+        caption = None
+    return caption
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
+    args = parser.parse_args()
+    train_dataset = build_dataset('a2d', image_set = 'train', args = args)
+    text_annotations = train_dataset.text_annotations
+    all_captions = {}
+    os.environ['OPENAI_API_KEY'] = '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'
+    for idx in range(100):
+        imgs, target = train_dataset[idx]
+        frames_idx = target['frames_idx'].tolist()
+        text_query, vid_id, frame_id, instance_id = text_annotations[idx]
+        frame_id = frame_id - 1
+        frame_order = frames_idx.index(frame_id)
+        frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
+        mask = target['masks'].numpy().astype(np.uint8).squeeze()
+        caption = getCaption(frame, mask, instance_id, text_query)
+        if vid_id not in all_captions:
+            all_captions[vid_id] = {frame_id : caption}
+        else:
+            all_captions[vid_id][frame_id] = caption
+    with open(args.save_caption_path, 'w') as file:
+        json.dump(all_captions, file, indent=4)

.history/mbench_a2d/gpt_a2d_numbered_20250205152326.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from datasets import build_dataset
+import argparse
+import opts
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+import cv2
+from io import BytesIO
+import base64
+from PIL import Image
+import json
+from openai import OpenAI
+def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
+    #마스크 색칠할지
+    if color_mask == True:
+        alpha = 0.1
+        colored_mask = np.zeros_like(frame)
+        colored_mask[mask == 1] = [255, 0, 0]
+        frame[mask == 1] = (
+            (1 - alpha) * frame[mask == 1] +
+            alpha * colored_mask[mask == 1]
+        )
+    #마스크 아웃라인 그리기
+    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
+    #instance_id 적을지
+    if label_number == True:
+        if len(contours) > 0:
+            largest_contour = max(contours, key=cv2.contourArea)
+            M = cv2.moments(largest_contour)
+            if M["m00"] != 0:
+                center_x = int(M["m10"] / M["m00"])
+                center_y = int(M["m01"] / M["m00"])
+            else:
+                center_x, center_y = 0, 0
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            text = str(instance_id)
+            font_scale = 0.6
+            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+            text_x = center_x - text_size[0] // 1  # 텍스트의 가로 중심
+            text_y = center_y
+            # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
+            # 텍스트 배경 사각형 좌표 계산
+            rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단
+            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+            rect_end = (text_x + text_size[0] + 5, text_y)
+            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
+    # plt.figure(figsize=(6, 10))
+    # plt.imshow(frame)
+    # plt.title(text_query)
+    # plt.tight_layout()
+    # plt.axis('off')
+    # plt.show()
+    buffer = BytesIO()
+    frame = Image.fromarray(frame)
+    frame.save(buffer, format='jpeg')
+    buffer.seek(0)
+    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
+    return encoded_frame
+def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
+    base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
+    captioner = OpenAI()
+    #필터링하지 않고 바로 ref exp 만들기
+    dense_caption_prompt = f"""
+    You are a visual assistant analyzing a single frame of a video.
+    In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
+    I also give you a text query describing the marked object.
+    I want to use your expression to create an **action-centric referring expression** dataset.
+    Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
+    ---
+    ## Guidelines:
+    1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
+    2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
+    3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
+    4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
+    5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
+    7. Base your description on these action definitions:
+    - Avoid using term 'minimal' or 'slightly'.
+    - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
+    - details such as motion and intention, facial with object manipulation
+    - movements with object or other entities when they are prominent and observable. expression should be specific.
+        (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
+    --
+    ## Output Format:
+    - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
+        object id. action-oriented description
+        (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
+    ### Example
+    If the frame has 1 labeled bear, your output should look like:
+    1. the bear reaching his right arm while leaning forward to capture the prey
+    ---
+    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
+    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
+    **Do not include markdown** in the output.
+    Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+    For each labeled object, output referring expressions for each object id.
+    """
+    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
+    MAX_RETRIES = 2
+    retry_count = 0
+    while retry_count < MAX_RETRIES:
+        response = captioner.chat.completions.create(
+            model=model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt_with_text_query,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        caption = response.choices[0].message.content.strip()
+        caption_lower = caption.lower().lstrip()
+        if caption_lower.startswith("1.") and not any(
+            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
+        ):
+            break
+        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
+        retry_count += 1
+        time.sleep(2)
+        if retry_count == MAX_RETRIES:
+            caption = None
+            print("Max retries reached. Caption generation failed.")
+    else:
+        caption = None
+    return caption
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
+    args = parser.parse_args()
+    train_dataset = build_dataset('a2d', image_set = 'train', args = args)
+    text_annotations = train_dataset.text_annotations
+    all_captions = {}
+    #os.environ['OPENAI_API_KEY'] = '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'
+    os.environ['OPENAI_API_KEY'] = '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'
+    for idx in range(100):
+        imgs, target = train_dataset[idx]
+        frames_idx = target['frames_idx'].tolist()
+        text_query, vid_id, frame_id, instance_id = text_annotations[idx]
+        print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)
+        frame_id = frame_id - 1
+        frame_order = frames_idx.index(frame_id)
+        frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
+        mask = target['masks'].numpy().astype(np.uint8).squeeze()
+        caption = getCaption(frame, mask, instance_id, text_query)
+        if vid_id not in all_captions:
+            all_captions[vid_id] = {frame_id : caption}
+        else:
+            all_captions[vid_id][frame_id] = caption
+    print("Finished!", flush=True)
+    with open(args.save_caption_path, 'w') as file:
+        json.dump(all_captions, file, indent=4)

.history/mbench_a2d/gpt_a2d_numbered_20250207110257.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from datasets import build_dataset
+import argparse
+import opts
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+import cv2
+from io import BytesIO
+import base64
+from PIL import Image
+import json
+from openai import OpenAI
+def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
+    #마스크 색칠할지
+    if color_mask == True:
+        alpha = 0.1
+        colored_mask = np.zeros_like(frame)
+        colored_mask[mask == 1] = [255, 0, 0]
+        frame[mask == 1] = (
+            (1 - alpha) * frame[mask == 1] +
+            alpha * colored_mask[mask == 1]
+        )
+    #마스크 아웃라인 그리기
+    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
+    #instance_id 적을지
+    if label_number == True:
+        if len(contours) > 0:
+            largest_contour = max(contours, key=cv2.contourArea)
+            M = cv2.moments(largest_contour)
+            if M["m00"] != 0:
+                center_x = int(M["m10"] / M["m00"])
+                center_y = int(M["m01"] / M["m00"])
+            else:
+                center_x, center_y = 0, 0
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            text = str(instance_id)
+            font_scale = 0.6
+            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+            text_x = center_x - text_size[0] // 1  # 텍스트의 가로 중심
+            text_y = center_y
+            # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
+            # 텍스트 배경 사각형 좌표 계산
+            rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단
+            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+            rect_end = (text_x + text_size[0] + 5, text_y)
+            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
+    # plt.figure(figsize=(6, 10))
+    # plt.imshow(frame)
+    # plt.title(text_query)
+    # plt.tight_layout()
+    # plt.axis('off')
+    # plt.show()
+    buffer = BytesIO()
+    frame = Image.fromarray(frame)
+    frame.save(buffer, format='jpeg')
+    buffer.seek(0)
+    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
+    return encoded_frame
+def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
+    base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
+    captioner = OpenAI()
+    #필터링하지 않고 바로 ref exp 만들기
+    dense_caption_prompt = f"""
+    You are a visual assistant analyzing a single frame of a video.
+    In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
+    I also give you a text query describing the marked object.
+    I want to use your expression to create an **action-centric referring expression** dataset.
+    Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
+    ---
+    ## Guidelines:
+    1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
+    2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
+    3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
+    4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
+    5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
+    7. Base your description on these action definitions:
+    - Avoid using term 'minimal' or 'slightly'.
+    - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
+    - details such as motion and intention, facial with object manipulation
+    - movements with object or other entities when they are prominent and observable. expression should be specific.
+        (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
+    --
+    ## Output Format:
+    - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
+        object id. action-oriented description
+        (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
+    ### Example
+    If the frame has 1 labeled bear, your output should look like:
+    1. the bear reaching his right arm while leaning forward to capture the prey
+    ---
+    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
+    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
+    **Do not include markdown** in the output.
+    Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+    For each labeled object, output referring expressions for each object id.
+    """
+    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
+    MAX_RETRIES = 2
+    retry_count = 0
+    while retry_count < MAX_RETRIES:
+        response = captioner.chat.completions.create(
+            model=model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt_with_text_query,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        caption = response.choices[0].message.content.strip()
+        caption_lower = caption.lower().lstrip()
+        if caption_lower.startswith("1.") and not any(
+            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
+        ):
+            break
+        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
+        retry_count += 1
+        time.sleep(2)
+        if retry_count == MAX_RETRIES:
+            caption = None
+            print("Max retries reached. Caption generation failed.")
+    else:
+        caption = None
+    return caption
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
+    args = parser.parse_args()
+    train_dataset = build_dataset('a2d', image_set = 'train', args = args)
+    text_annotations = train_dataset.text_annotations
+    all_captions = {}
+    #os.environ['OPENAI_API_KEY'] = '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'
+    os.environ['OPENAI_API_KEY'] = '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'
+    first_text_query = ""
+    for idx in range(300):
+        imgs, target = train_dataset[idx]
+        frames_idx = target['frames_idx'].tolist()
+        text_query, vid_id, frame_id, instance_id = text_annotations[idx]
+        if text_query == first_text_query:
+            continue
+        print(f"------------vid id: {vid_id}, frame id: {frame_id}, instance id: {instance_id}", flush=True)
+        frame_id = frame_id - 1
+        frame_order = frames_idx.index(frame_id)
+        frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
+        mask = target['masks'].numpy().astype(np.uint8).squeeze()
+        caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini')
+        if vid_id in all_captions:
+            if frame_id in all_captions[vid_id]:
+                all_captions[vid_id][frame_id][instance_id] = caption
+            else:
+                all_captions[vid_id][frame_id] = {instance_id : caption}
+        else:
+            all_captions[vid_id] = {frame_id : {instance_id: caption}}
+        if idx % 50 == 0:
+            with open(args.save_caption_path, 'w') as file:
+                json.dump(all_captions, file, indent=4)
+    print("Finished!", flush=True)
+    with open(args.save_caption_path, 'w') as file:
+                json.dump(all_captions, file, indent=4)

.history/slurm_script/jupyter_20250121151552.sh ADDED Viewed

	@@ -0,0 +1,16 @@

+#!/bin/bash
+#SBATCH --job-name=jupyter
+#SBATCH --partition=a5000
+#SBATCH --nodelist=node04
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+ srun jupyter notebook --no-browser --port=7890

.history/slurm_script/jupyter_20250121151643.sh ADDED Viewed

	@@ -0,0 +1,16 @@

+#!/bin/bash
+#SBATCH --job-name=jupyter
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+ srun jupyter notebook --no-browser --port=7890

.history/slurm_script/mbench_gpt_a2d_20250205122515.sh ADDED Viewed

	@@ -0,0 +1,19 @@

+#!/bin/bash
+# Slurm batch script: generate numbered GPT captions for the A2D dataset.
+#SBATCH --job-name=mbench_gpt_a2d
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_a2d.out
+# Run from the project root so the relative script and output paths resolve.
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+# Reset loaded modules, load CUDA 12.1, then activate the referformer conda env.
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+# Launch the A2D caption script. The job name, log file and output path all
+# refer to A2D, so the Ref-YTVOS script previously named here did not match.
+ python3 mbench_a2d/gpt_a2d_numbered.py \
+    --save_caption_path mbench_a2d/numbered_captions.json

.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155940.sh ADDED Viewed

	@@ -0,0 +1,18 @@

+#!/bin/bash
+#SBATCH --job-name=mbench_gpt_ref-ytvos_revised
+#SBATCH --partition=a5000
+#SBATCH --nodelist=node04
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_revised.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+ python3 mbench/gpt_ref-ytvos_revised.py

.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121160841.sh ADDED Viewed

	@@ -0,0 +1,18 @@

+#!/bin/bash
+#SBATCH --job-name=mbench_gpt_ref-ytvos_revised50
+#SBATCH --partition=a5000
+#SBATCH --nodelist=node04
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_revised50.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+ python3 mbench/gpt_ref-ytvos_revised.py

.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250124085144.sh ADDED Viewed

	@@ -0,0 +1,18 @@

+#!/bin/bash
+#SBATCH --job-name=mbench_gpt_ref-ytvos_revised50
+#SBATCH --partition=a5000
+#SBATCH --nodelist=node04
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_revised50.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+ python3 mbench/gpt_ref-ytvos-revised.py

.history/slurm_script/mbench_gpt_ref-ytvos_20250119070944.sh ADDED Viewed

	@@ -0,0 +1,18 @@

+#!/bin/bash
+#SBATCH --job-name=mbench_gpt_ref-ytvos
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+ python3 mbench/gpt_ref-ytvos.py

.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130190228.sh ADDED Viewed

	@@ -0,0 +1,20 @@

+#!/bin/bash
+#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+ python3 mbench/gpt_ref-ytvos_numbered_cy.py \
+    --save_caption_path mbench/numbered_captions.json \
+    --save_valid_obj_ids_path mbench/numbered_valid_obj_ids.json

.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250201140706.sh ADDED Viewed

	@@ -0,0 +1,20 @@

+#!/bin/bash
+#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+ python3 mbench/gpt_ref-ytvos_numbered_cy.py \
+    --save_caption_path mbench/numbered_captions_gpt-4o.json \
+    --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o.json

.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250202183206.sh ADDED Viewed

	@@ -0,0 +1,20 @@

+#!/bin/bash
+#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+ python3 mbench/gpt_ref-ytvos_numbered_cy.py \
+    --save_caption_path mbench/numbered_captions_gpt-4o_no_mask_color.json \
+    --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_no_mask_color.json

.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171604.sh ADDED Viewed

	@@ -0,0 +1,20 @@

+#!/bin/bash
+#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered_final
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered_final.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+ python3 mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py \
+    --save_caption_path mbench/numbered_captions_gpt-4o_final.json \
+    --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_final.json

.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207172920.sh ADDED Viewed

	@@ -0,0 +1,20 @@

+#!/bin/bash
+#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered_final
+#SBATCH --partition=a5000
+#SBATCH --nodelist=node04
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered_final.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+ python3 mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py \
+    --save_caption_path mbench/numbered_captions_gpt-4o_final.json \
+    --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_final.json

hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/ca26d90c9e8e071d0bc31b570aef68306d0be1db4330471d10a117061a15a991.lock ADDED Viewed

File without changes

hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/pytorch_model.bin ADDED Viewed

File without changes

hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b
+size 9999791010