import argparse
import math

import numpy as np
import torch as th
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm

from model import get_model
from pathbuilder import PathBuilder
from preprocessing import Preprocessing
from random_sequence_shuffler import RandomSequenceSampler
from videoreader import VideoLoader
|
# ---------------------------------------------------------------------------
# Command-line interface.
# All flags keep their original names, types and defaults; the only change is
# that --vdir/--fdir/--hflip now carry help strings like every other option.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Easy video feature extractor')

parser.add_argument('--vdir', type=str,
                    help='input video directory')
parser.add_argument('--fdir', type=str,
                    help='output feature directory')
parser.add_argument('--hflip', type=int, default=0,
                    help='horizontally flip frames during decoding (0/1)')

parser.add_argument('--batch_size', type=int, default=64,
                    help='batch size')
parser.add_argument('--type', type=str, default='2d',
                    help='CNN type')
parser.add_argument('--half_precision', type=int, default=0,
                    help='output half precision float')
parser.add_argument('--num_decoding_thread', type=int, default=4,
                    help='Num parallel thread for video decoding')
parser.add_argument('--l2_normalize', type=int, default=1,
                    help='l2 normalize feature')
parser.add_argument('--resnext101_model_path', type=str, default='model/resnext101.pth',
                    help='Resnext model path')
parser.add_argument('--vmz_model_path', type=str, default='model/r2plus1d_34_clip8_ig65m_from_scratch-9bae36ae.pth',
                    help='vmz model path')

# Parsed once at import; the whole script reads from this namespace.
args = parser.parse_args()
| |
|
# ---------------------------------------------------------------------------
# Per-model decoding presets, keyed by --type.
#   fps        : frame rate to decode at
#   size       : spatial resolution fed to the network
#   centercrop : crop to a square instead of resizing both sides
#   shards     : output sharding parameter forwarded to PathBuilder
#                (presumably splits feature files across sub-dirs — confirm)
# ---------------------------------------------------------------------------
CONFIGS = {
    # 2D CNN: one frame per second, no cropping.
    "2d": {"fps": 1, "size": 224, "centercrop": False, "shards": 0},
    # 3D CNN: dense 24 fps clips at low resolution.
    "3d": {"fps": 24, "size": 112, "centercrop": True, "shards": 0},
    # S3D: 30 fps clips at full resolution.
    "s3d": {"fps": 30, "size": 224, "centercrop": True, "shards": 0},
    # VMZ (R(2+1)D): same decoding as 3d.
    "vmz": {"fps": 24, "size": 112, "centercrop": True, "shards": 0},
    # VAE tokenizer: 2 fps, 256px, sharded output.
    "vae": {"fps": 2, "size": 256, "centercrop": True, "shards": 100},
}
| |
|
| | config = CONFIGS[args.type] |
| |
|
| |
|
| | video_dirs = args.vdir |
| | feature_dir = args.fdir |
| |
|
| | video_dict = PathBuilder.build(video_dirs, feature_dir, ".npy", config["shards"]) |
| |
|
| | dataset = VideoLoader( |
| | video_dict=video_dict, |
| | framerate=config["fps"], |
| | size=config["size"], |
| | centercrop=config["centercrop"], |
| | hflip=args.hflip |
| | ) |
| | n_dataset = len(dataset) |
| | sampler = RandomSequenceSampler(n_dataset, 10) |
| | loader = DataLoader( |
| | dataset, |
| | batch_size=1, |
| | shuffle=False, |
| | num_workers=args.num_decoding_thread, |
| | sampler=sampler if n_dataset > 10 else None, |
| | ) |
| | preprocess = Preprocessing(args.type) |
| | model = get_model(args) |
| |
|
# ---------------------------------------------------------------------------
# Feature extraction loop.  Per video: decode -> preprocess -> run the model
# batch-by-batch on GPU -> save one .npy feature file.
# ---------------------------------------------------------------------------
with th.no_grad():
    for data in tqdm(loader, total=len(loader), ascii=True):
        input_file = data['input'][0]
        output_file = data['output'][0]
        # A failed decode yields a degenerate tensor (<= 3 dims); skip it.
        if len(data['video'].shape) > 3:
            video = data['video'].squeeze()
            if len(video.shape) == 4:
                video = preprocess(video)
                n_chunk = len(video)
                # Pre-allocate the per-video feature buffer; the feature
                # width (and dtype) depends on the backbone.
                if args.type == 'vmz':
                    # vmz consumes every 3rd clip (strided slice below),
                    # so it produces a third as many feature rows.
                    n_chunk = math.ceil(n_chunk / float(3))
                    features = th.cuda.FloatTensor(n_chunk, 512).fill_(0)
                elif args.type == 's3d':
                    features = th.cuda.FloatTensor(n_chunk, 512).fill_(0)
                elif args.type == "vae":
                    # The VAE emits discrete codes, hence integer storage.
                    features = th.cuda.LongTensor(n_chunk, 1024).fill_(0)
                else:
                    features = th.cuda.FloatTensor(n_chunk, 2048).fill_(0)
                n_iter = int(math.ceil(n_chunk / float(args.batch_size)))
                for i in range(n_iter):
                    # vmz strides over the decoded clips; all others are dense.
                    factor = 3 if args.type == 'vmz' else 1
                    min_ind = factor * i * args.batch_size
                    max_ind = factor * (i + 1) * args.batch_size
                    video_batch = video[min_ind:max_ind:factor].cuda()
                    if args.type == '2d':
                        batch_features = model(video_batch)
                    elif args.type == 's3d':
                        batch_features = model(video_batch)['video_embedding']
                    elif args.type == "vae":
                        batch_features = model(video_batch)
                    else:
                        # Backbone returns (predictions, features).
                        batch_pred, batch_features = model(video_batch)
                    # BUG FIX: never L2-normalize the VAE output -- it is a
                    # LongTensor of discrete codes, so F.normalize would
                    # raise (no float norm on integer tensors) and the
                    # operation is meaningless for indices anyway.
                    if args.l2_normalize and args.type != "vae":
                        batch_features = F.normalize(batch_features, dim=1)
                    features[i * args.batch_size:(i + 1) * args.batch_size] = batch_features
                features = features.cpu().numpy()
                # On-disk dtype: integer codes for the VAE, floats otherwise;
                # --half_precision halves the stored width.
                if args.type == "vae":
                    features = features.astype(np.int16 if args.half_precision else np.int32)
                else:
                    features = features.astype(np.float16 if args.half_precision else np.float32)
                np.save(output_file, features)
        else:
            print('Video {} error.'.format(input_file))
| |
|