| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | """ |
| | MOT dataset which returns image_id for evaluation. |
| | """ |
| | from collections import defaultdict |
| | import json |
| | import os |
| | from pathlib import Path |
| | import cv2 |
| | import numpy as np |
| | import torch |
| | import torch.utils.data |
| | import os.path as osp |
| | from PIL import Image, ImageDraw |
| | import copy |
| | import datasets.transforms as T |
| | from models.structures import Instances |
| |
|
| | from random import choice, randint |
| |
|
| |
|
def is_crowd(ann):
    """Return True if a CrowdHuman annotation is flagged as ignore (crowd)."""
    if 'extra' not in ann:
        return False
    extra = ann['extra']
    return 'ignore' in extra and extra['ignore'] == 1
| |
|
| |
|
class DetMOTDetection:
    """MOT-style clip dataset (DanceTrack GT + optional CrowdHuman stills).

    Each item is a short clip of frames with ground-truth Instances and
    precomputed detection proposals loaded from ``args.det_db``.
    """

    def __init__(self, args, data_txt_path: str, seqs_folder, transform):
        self.args = args
        self.transform = transform
        self.num_frames_per_batch = max(args.sampler_lengths)
        self.sample_mode = args.sample_mode
        self.sample_interval = args.sample_interval
        self.video_dict = {}
        self.mot_path = args.mot_path

        # vid -> frame index -> list of [x, y, w, h, track_id, is_crowd]
        self.labels_full = defaultdict(lambda: defaultdict(list))

        def add_mot_folder(split_dir):
            # Parse gt/gt.txt of every sequence under split_dir into labels_full.
            print("Adding", split_dir)
            for vid in os.listdir(os.path.join(self.mot_path, split_dir)):
                if 'seqmap' == vid:
                    continue
                vid = os.path.join(split_dir, vid)
                if 'DPM' in vid or 'FRCNN' in vid:
                    # keep a single detector variant of each MOT17 sequence
                    print(f'filter {vid}')
                    continue
                gt_path = os.path.join(self.mot_path, vid, 'gt', 'gt.txt')
                for l in open(gt_path):
                    t, i, *xywh, mark, label = l.strip().split(',')[:8]
                    t, i, mark, label = map(int, (t, i, mark, label))
                    if mark == 0:
                        continue
                    if label in [3, 4, 5, 6, 9, 10, 11]:
                        # skip non-target classes (MOT GT label ids) — TODO confirm set
                        continue
                    crowd = False
                    x, y, w, h = map(float, xywh)
                    self.labels_full[vid][t].append([x, y, w, h, i, crowd])

        add_mot_folder("DanceTrack/train")
        vid_files = list(self.labels_full.keys())

        # Every (video, start_frame) pair that leaves room for a full clip.
        self.indices = []
        self.vid_tmax = {}
        for vid in vid_files:
            self.video_dict[vid] = len(self.video_dict)
            t_min = min(self.labels_full[vid].keys())
            t_max = max(self.labels_full[vid].keys()) + 1
            self.vid_tmax[vid] = t_max - 1
            for t in range(t_min, t_max - self.num_frames_per_batch):
                self.indices.append((vid, t))
        print(f"Found {len(vid_files)} videos, {len(self.indices)} frames")

        self.sampler_steps: list = args.sampler_steps
        self.lengths: list = args.sampler_lengths
        # fixed typo: "lenghts" -> "lengths"
        print("sampler_steps={} lengths={}".format(self.sampler_steps, self.lengths))
        self.period_idx = 0

        # Optional CrowdHuman still images, addressed after the video clips.
        self.ch_dir = Path(args.mot_path) / 'crowdhuman'
        self.ch_indices = []
        if args.append_crowd:
            for line in open(self.ch_dir / "annotation_trainval.odgt"):
                datum = json.loads(line)
                boxes = [ann['fbox'] for ann in datum['gtboxes'] if not is_crowd(ann)]
                self.ch_indices.append((datum['ID'], boxes))
        print(f"Found {len(self.ch_indices)} images")

        # Precomputed detection proposals keyed by per-frame txt path.
        if args.det_db:
            with open(os.path.join(args.mot_path, args.det_db)) as f:
                self.det_db = json.load(f)
        else:
            self.det_db = defaultdict(list)

    def set_epoch(self, epoch):
        """Update the clip length according to the sampler-length curriculum."""
        self.current_epoch = epoch
        if self.sampler_steps is None or len(self.sampler_steps) == 0:
            # no curriculum configured; keep the fixed clip length
            return

        # Recompute the period from scratch so repeated/out-of-order calls
        # are idempotent (previously period_idx was never reset).
        self.period_idx = 0
        for i in range(len(self.sampler_steps)):
            if epoch >= self.sampler_steps[i]:
                self.period_idx = i + 1
        print("set epoch: epoch {} period_idx={}".format(epoch, self.period_idx))
        self.num_frames_per_batch = self.lengths[self.period_idx]

    def step_epoch(self):
        """Advance the curriculum by one epoch (call at the end of an epoch)."""
        print("Dataset: epoch {} finishes".format(self.current_epoch))
        self.set_epoch(self.current_epoch + 1)

    @staticmethod
    def _targets_to_instances(targets: dict, img_shape) -> "Instances":
        """Pack the GT portion of a targets dict into an Instances object.

        Proposal boxes appended after the first ``len(targets['labels'])``
        entries are intentionally dropped here.
        """
        gt_instances = Instances(tuple(img_shape))
        n_gt = len(targets['labels'])
        gt_instances.boxes = targets['boxes'][:n_gt]
        gt_instances.labels = targets['labels']
        gt_instances.obj_ids = targets['obj_ids']
        return gt_instances

    def load_crowd(self, index):
        """Build a pseudo-clip from one CrowdHuman image (GT boxes + proposals)."""
        ID, boxes = self.ch_indices[index]
        boxes = copy.deepcopy(boxes)
        img = Image.open(self.ch_dir / 'Images' / f'{ID}.jpg')

        # use the public accessor instead of PIL's private _size attribute
        w, h = img.size
        n_gts = len(boxes)
        scores = [0. for _ in range(len(boxes))]
        for line in self.det_db[f'crowdhuman/train_image/{ID}.txt']:
            *box, s = map(float, line.split(','))
            boxes.append(box)
            scores.append(s)
        boxes = torch.tensor(boxes, dtype=torch.float32)
        areas = boxes[..., 2:].prod(-1)  # w*h, computed before xywh -> xyxy
        boxes[:, 2:] += boxes[:, :2]

        target = {
            'boxes': boxes,
            'scores': torch.as_tensor(scores),
            # labels/iscrowd cover only the n_gts GT boxes, not the proposals
            'labels': torch.zeros((n_gts, ), dtype=torch.long),
            'iscrowd': torch.zeros((n_gts, ), dtype=torch.bool),
            'image_id': torch.tensor([0]),
            'area': areas,
            'obj_ids': torch.arange(n_gts),
            'size': torch.as_tensor([h, w]),
            'orig_size': torch.as_tensor([h, w]),
            'dataset': "CrowdHuman",
        }
        # Replicate the still image into a clip of randomly shifted crops.
        rs = T.FixedMotRandomShift(self.num_frames_per_batch)
        return rs([img], [target])

    def _pre_single_frame(self, vid, idx: int):
        """Load one frame and its GT boxes followed by detection proposals."""
        img_path = os.path.join(self.mot_path, vid, 'img1', f'{idx:08d}.jpg')
        img = Image.open(img_path)
        targets = {}
        # use the public accessor instead of PIL's private _size attribute
        w, h = img.size
        assert w > 0 and h > 0, "invalid image {} with shape {} {}".format(img_path, w, h)
        # offset track ids so they are globally unique across videos
        obj_idx_offset = self.video_dict[vid] * 100000

        targets['dataset'] = 'MOT17'
        targets['boxes'] = []
        targets['iscrowd'] = []
        targets['labels'] = []
        targets['obj_ids'] = []
        targets['scores'] = []
        targets['image_id'] = torch.as_tensor(idx)
        targets['size'] = torch.as_tensor([h, w])
        targets['orig_size'] = torch.as_tensor([h, w])
        # renamed `id` -> `track_id` to avoid shadowing the builtin
        for *xywh, track_id, crowd in self.labels_full[vid][idx]:
            targets['boxes'].append(xywh)
            assert not crowd
            targets['iscrowd'].append(crowd)
            targets['labels'].append(0)
            targets['obj_ids'].append(track_id + obj_idx_offset)
            targets['scores'].append(1.)
        # proposals contribute boxes + scores only, appended after the GT
        txt_key = os.path.join(vid, 'img1', f'{idx:08d}.txt')
        for line in self.det_db[txt_key]:
            *box, s = map(float, line.split(','))
            targets['boxes'].append(box)
            targets['scores'].append(s)

        targets['iscrowd'] = torch.as_tensor(targets['iscrowd'])
        targets['labels'] = torch.as_tensor(targets['labels'])
        targets['obj_ids'] = torch.as_tensor(targets['obj_ids'], dtype=torch.float64)
        targets['scores'] = torch.as_tensor(targets['scores'])
        targets['boxes'] = torch.as_tensor(targets['boxes'], dtype=torch.float32).reshape(-1, 4)
        targets['boxes'][:, 2:] += targets['boxes'][:, :2]  # xywh -> xyxy
        return img, targets

    def _get_sample_range(self, start_idx):
        """Return (begin, end, interval) suitable for range() over clip frames."""
        assert self.sample_mode in ['fixed_interval', 'random_interval'], 'invalid sample mode: {}'.format(self.sample_mode)
        if self.sample_mode == 'fixed_interval':
            sample_interval = self.sample_interval
        elif self.sample_mode == 'random_interval':
            sample_interval = np.random.randint(1, self.sample_interval + 1)
        default_range = start_idx, start_idx + (self.num_frames_per_batch - 1) * sample_interval + 1, sample_interval
        return default_range

    def pre_continuous_frames(self, vid, indices):
        """Load the given frame indices; returns (images, targets) sequences."""
        return zip(*[self._pre_single_frame(vid, i) for i in indices])

    def sample_indices(self, vid, f_index):
        """Sample clip frame ids from f_index with a random stride, clamping
        each id to the last annotated frame of the video."""
        assert self.sample_mode == 'random_interval'
        # random.randint is inclusive on BOTH ends, so the bound is
        # sample_interval itself; the previous `sample_interval + 1` allowed
        # a stride one larger than _get_sample_range's np.random.randint
        # (whose high end is exclusive).
        rate = randint(1, self.sample_interval)
        tmax = self.vid_tmax[vid]
        ids = [f_index + rate * i for i in range(self.num_frames_per_batch)]
        return [min(i, tmax) for i in ids]

    def __getitem__(self, idx):
        """Return {'imgs', 'gt_instances', 'proposals'} for one clip."""
        if idx < len(self.indices):
            vid, f_index = self.indices[idx]
            indices = self.sample_indices(vid, f_index)
            images, targets = self.pre_continuous_frames(vid, indices)
        else:
            # indices past the video clips address the CrowdHuman stills
            images, targets = self.load_crowd(idx - len(self.indices))
        if self.transform is not None:
            images, targets = self.transform(images, targets)
        gt_instances, proposals = [], []
        for img_i, targets_i in zip(images, targets):
            gt_instances_i = self._targets_to_instances(targets_i, img_i.shape[1:3])
            gt_instances.append(gt_instances_i)
            n_gt = len(targets_i['labels'])
            # proposal boxes/scores are everything after the GT entries
            proposals.append(torch.cat([
                targets_i['boxes'][n_gt:],
                targets_i['scores'][n_gt:, None],
            ], dim=1))
        return {
            'imgs': images,
            'gt_instances': gt_instances,
            'proposals': proposals,
        }

    def __len__(self):
        # video clip starts plus appended CrowdHuman images
        return len(self.indices) + len(self.ch_indices)
| |
|
| |
|
class DetMOTDetectionValidation(DetMOTDetection):
    """Validation variant: swaps in the validation annotation list, then
    delegates to DetMOTDetection."""

    def __init__(self, args, seqs_folder, transform):
        args.data_txt_path = args.val_data_txt_path
        # BUG FIX: DetMOTDetection.__init__ takes
        # (args, data_txt_path, seqs_folder, transform); the previous call
        # omitted data_txt_path, shifting every argument by one position and
        # raising TypeError (missing 'transform').
        super().__init__(args, args.data_txt_path, seqs_folder, transform)
| |
|
| |
|
def make_transforms_for_mot17(image_set, args=None):
    """Build the MOT transform pipeline for the 'train' or 'val' split.

    Raises ValueError for any other split name.
    """
    normalize = T.MotCompose([
        T.MotToTensor(),
        T.MotNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    scales = [608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992]

    if image_set == 'train':
        # Either resize directly, or resize -> crop -> resize again.
        plain_resize = T.MotRandomResize(scales, max_size=1536)
        crop_branch = T.MotCompose([
            T.MotRandomResize([800, 1000, 1200]),
            T.FixedMotRandomCrop(800, 1200),
            T.MotRandomResize(scales, max_size=1536),
        ])
        return T.MotCompose([
            T.MotRandomHorizontalFlip(),
            T.MotRandomSelect(plain_resize, crop_branch),
            T.MOTHSV(),
            normalize,
        ])

    if image_set == 'val':
        return T.MotCompose([
            T.MotRandomResize([800], max_size=1333),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')
| |
|
| |
|
def build_transform(args, image_set):
    """Return the transform pipeline matching the requested image_set."""
    # Both pipelines are constructed up front, exactly as before.
    pipelines = {
        'train': make_transforms_for_mot17('train', args),
        'val': make_transforms_for_mot17('val', args),
    }
    if image_set not in pipelines:
        raise NotImplementedError()
    return pipelines[image_set]
| |
|
| |
|
def build(image_set, args):
    """Factory: construct the DetMOTDetection dataset for a split.

    Note: build_transform raises NotImplementedError for any split other
    than 'train'/'val' before the dataset is constructed.
    """
    root = Path(args.mot_path)
    assert root.exists(), f'provided MOT path {root} does not exist'
    transform = build_transform(args, image_set)
    txt_paths = {
        'train': args.data_txt_path_train,
        'val': args.data_txt_path_val,
    }
    return DetMOTDetection(
        args,
        data_txt_path=txt_paths[image_set],
        seqs_folder=root,
        transform=transform,
    )
| |
|