| import clip |
| import numpy as np |
| import torch |
| from mmaction.datasets.transforms import (CenterCrop, DecordDecode, DecordInit, |
| FormatShape, Resize) |
| from torchvision import transforms |
|
|
|
|
class _SampleFramesAtFPS:
    '''Pipeline step that selects frame indices at a fixed rate.

    Required Keys:
        - total_frames
        - start_index
        - avg_fps

    Added Keys:
        - frame_interval
        - frame_inds
        - num_clips

    Args:
        sample_fps (float): Number of frames to keep per second of video.
            Must be positive. Defaults to 1.0.
    '''

    def __init__(self, sample_fps: float = 1.0) -> None:
        if sample_fps <= 0:
            raise ValueError(
                f'sample_fps must be positive, got {sample_fps!r}')
        self.sample_fps = sample_fps

    def transform(self, video_info: dict) -> dict:
        '''Add ``frame_inds``, ``frame_interval`` and ``num_clips``.

        Args:
            video_info (dict): Pipeline state holding at least
                ``start_index``, ``total_frames`` and ``avg_fps``.

        Returns:
            dict: The same dict, updated in place.

        Raises:
            ValueError: If ``avg_fps`` is not positive.
        '''
        avg_fps = float(video_info['avg_fps'])
        if avg_fps <= 0:
            raise ValueError(
                f'avg_fps must be positive, got {avg_fps!r}')

        # Source frames between two consecutive samples. Clamp to one frame
        # so that asking for more samples than the video has per second
        # degrades to "every frame" instead of a zero step or duplicates.
        step = max(avg_fps / self.sample_fps, 1.0)

        # Build the positions in floating point and truncate afterwards.
        # Passing dtype=int straight to np.arange makes NumPy use the
        # truncated step (e.g. 29 instead of 29.97), so the sampled frames
        # drift further from the requested rate the longer the video is.
        total_frames = video_info['total_frames']
        frame_inds = np.arange(
            video_info['start_index'], total_frames, step).astype(int)
        # Guard against float rounding producing an index at the stop value.
        frame_inds = frame_inds[frame_inds < total_frames]

        video_info['frame_inds'] = frame_inds
        video_info['frame_interval'] = 1
        video_info['num_clips'] = len(frame_inds)
        return video_info


def extract_clip_feature_single_video_fps(
        video_path: str,
        clip_ckpt_path: str = 'ViT-L-14.pt',
        device: str = 'cuda',
        sample_fps: float = 1.0) -> tuple:
    '''Extract per-frame CLIP image features from one video.

    The video is decoded at ``sample_fps`` frames per second, each frame is
    resized and center-cropped to 224x224, normalized, and passed through
    the CLIP image encoder as a single batch.

    Args:
        video_path (str): Path of the video file to read.
        clip_ckpt_path (str): Model name or checkpoint path handed to
            ``clip.load``. Defaults to 'ViT-L-14.pt'.
        device (str): Device used for the model and the frame batch.
            Defaults to 'cuda'.
        sample_fps (float): Frames to sample per second of video.
            Defaults to 1.0.

    Returns:
        tuple: ``(video_feat, video_info)``. ``video_feat`` is the tensor
        returned by ``encode_image`` for the sampled frames (still on
        ``device``); ``video_info`` is the dict produced by the decoding
        pipeline, including ``frame_inds`` and ``imgs``.
    '''
    video_info = {'filename': video_path, 'start_index': 0}
    # DecordInit runs first because the sampler needs ``total_frames`` and
    # ``avg_fps``, which are not in the initial dict (presumably filled in
    # by DecordInit -- confirm against the installed mmaction version).
    video_processors = [
        DecordInit(),
        _SampleFramesAtFPS(sample_fps),
        DecordDecode(),
        Resize(scale=(-1, 224)),
        CenterCrop(crop_size=224),
        FormatShape(input_format='NCHW'),
    ]

    # Decode the video into a stacked array of frames.
    for processor in video_processors:
        video_info = processor.transform(video_info)

    imgs = torch.from_numpy(video_info['imgs'])

    # Convert to float32, then normalize per channel. The statistics are
    # presumably those of CLIP's reference preprocessing -- confirm if the
    # checkpoint is swapped for a different model family.
    imgs_transforms = transforms.Compose([
        transforms.ConvertImageDtype(dtype=torch.float32),
        transforms.Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
            inplace=False)
    ])
    imgs = imgs_transforms(imgs).to(device)

    # NOTE: the model is loaded on every call; callers processing many
    # videos pay that cost once per video.
    clip_model, _ = clip.load(clip_ckpt_path, device)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        video_feat = clip_model.encode_image(imgs)

    return video_feat, video_info
|
|
|
|
if __name__ == '__main__':
    # Script entry point: extract CLIP features for a fixed list of example
    # videos and store one ``.npy`` file per video.
    import os

    device = "cuda" if torch.cuda.is_available() else "cpu"

    video_names = [
        'cook.mp4', 'latex.mp4', 'nba.mp4', 'temple_of_heaven.mp4',
        'south_pole.mp4', 'tv_series.mp4', 'formula_one.mp4', 'make-up.mp4',
        'police.mp4'
    ]
    video_dir = '/mnt/petrelfs/wangyiqin/vid_cap/examples/videos/'

    # np.save does not create missing directories, so make sure the target
    # exists before the (slow) extraction loop starts.
    output_dir = 'clip_features/20/'
    os.makedirs(output_dir, exist_ok=True)

    for video_name in video_names:
        # The extractor returns (features, video_info); only the features
        # are saved. Binding the whole tuple and calling .cpu() on it would
        # raise AttributeError.
        video_feat, _ = extract_clip_feature_single_video_fps(
            video_path=video_dir + video_name,
            clip_ckpt_path='ViT-L-14.pt',
            device=device)
        # Move off the accelerator before converting to a NumPy array.
        video_feat = video_feat.cpu().numpy()

        # Name the output after the video, minus its file extension.
        stem = os.path.splitext(video_name)[0]
        np.save(output_dir + stem + '.npy', video_feat)
        print(video_feat.shape)
        print(video_name + ' DONE')
|
|