File size: 5,529 Bytes
878264b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | # Copyright Howto100M authors.
# Copyright (c) Facebook, Inc. All Rights Reserved
import torch as th
import torch.nn.functional as F
import math
import numpy as np
import argparse
from torch.utils.data import DataLoader
from model import get_model
from preprocessing import Preprocessing
from random_sequence_shuffler import RandomSequenceSampler
from tqdm import tqdm
from pathbuilder import PathBuilder
from videoreader import VideoLoader
parser = argparse.ArgumentParser(description='Easy video feature extractor')
parser.add_argument('--vdir', type=str)
parser.add_argument('--fdir', type=str)
parser.add_argument('--hflip', type=int, default=0)
parser.add_argument('--batch_size', type=int, default=64,
help='batch size')
parser.add_argument('--type', type=str, default='2d',
help='CNN type')
parser.add_argument('--half_precision', type=int, default=0,
help='output half precision float')
parser.add_argument('--num_decoding_thread', type=int, default=4,
help='Num parallel thread for video decoding')
parser.add_argument('--l2_normalize', type=int, default=1,
help='l2 normalize feature')
parser.add_argument('--resnext101_model_path', type=str, default='model/resnext101.pth',
help='Resnext model path')
parser.add_argument('--vmz_model_path', type=str, default='model/r2plus1d_34_clip8_ig65m_from_scratch-9bae36ae.pth',
help='vmz model path')
args = parser.parse_args()
# TODO: refactor all args into config. (current code is from different people.)
CONFIGS = {
"2d": {
"fps": 1,
"size": 224,
"centercrop": False,
"shards": 0,
},
"3d": {
"fps": 24,
"size": 112,
"centercrop": True,
"shards": 0,
},
"s3d": {
"fps": 30,
"size": 224,
"centercrop": True,
"shards": 0,
},
"vmz": {
"fps": 24,
"size": 112,
"centercrop": True,
"shards": 0,
},
"vae": {
"fps": 2,
"size": 256,
"centercrop": True,
"shards": 100,
}
}
config = CONFIGS[args.type]
video_dirs = args.vdir
feature_dir = args.fdir
video_dict = PathBuilder.build(video_dirs, feature_dir, ".npy", config["shards"])
dataset = VideoLoader(
video_dict=video_dict,
framerate=config["fps"],
size=config["size"],
centercrop=config["centercrop"],
hflip=args.hflip
)
n_dataset = len(dataset)
sampler = RandomSequenceSampler(n_dataset, 10)
loader = DataLoader(
dataset,
batch_size=1,
shuffle=False,
num_workers=args.num_decoding_thread,
sampler=sampler if n_dataset > 10 else None,
)
preprocess = Preprocessing(args.type)
model = get_model(args)
with th.no_grad():
for k, data in tqdm(enumerate(loader), total=loader.__len__(), ascii=True):
input_file = data['input'][0]
output_file = data['output'][0]
if len(data['video'].shape) > 3:
video = data['video'].squeeze()
if len(video.shape) == 4:
video = preprocess(video)
n_chunk = len(video)
if args.type == 'vmz':
n_chunk = math.ceil(n_chunk/float(3))
features = th.cuda.FloatTensor(n_chunk, 512).fill_(0)
elif args.type == 's3d':
features = th.cuda.FloatTensor(n_chunk, 512).fill_(0)
elif args.type == "vae":
features = th.cuda.LongTensor(n_chunk, 1024).fill_(0)
else:
features = th.cuda.FloatTensor(n_chunk, 2048).fill_(0)
n_iter = int(math.ceil(n_chunk / float(args.batch_size)))
for i in range(n_iter):
factor = 1
if args.type == 'vmz':
factor = 3
min_ind = factor * i * args.batch_size
max_ind = factor * (i + 1) * args.batch_size
video_batch = video[min_ind:max_ind:factor].cuda()
if args.type == '2d':
batch_features = model(video_batch) # (51, 487), (51, 512)
elif args.type == 's3d':
batch_features = model(video_batch)
batch_features = batch_features['video_embedding']
elif args.type == "vae":
# image_code.
batch_features = model(video_batch)
else:
batch_pred, batch_features = model(video_batch) # (51, 487), (51, 512)
if args.l2_normalize:
batch_features = F.normalize(batch_features, dim=1)
features[i*args.batch_size:(i+1)*args.batch_size] = batch_features
features = features.cpu().numpy()
if args.half_precision:
if args.type == "vae":
features = features.astype(np.int16)
else:
features = features.astype('float16')
else:
if args.type == "vae":
features = features.astype(np.int32)
else:
features = features.astype('float32')
np.save(output_file, features)
else:
print('Video {} error.'.format(input_file))
|