Update app.py
Browse files
app.py
CHANGED
|
@@ -169,7 +169,7 @@ move2gpu(refcoco_models, refcoco_cfg)
|
|
| 169 |
move2gpu(vqa_models, vqa_cfg)
|
| 170 |
move2gpu(general_models, general_cfg)
|
| 171 |
move2gpu(video_caption_models, general_cfg)
|
| 172 |
-
move2gpu(
|
| 173 |
|
| 174 |
# # Initialize generator
|
| 175 |
caption_generator = caption_task.build_generator(caption_models, caption_cfg.generation)
|
|
@@ -198,7 +198,7 @@ pad_idx = general_task.src_dict.pad()
|
|
| 198 |
|
| 199 |
type_transform = transforms.Lambda(lambda x: x.float().div(255.0))
|
| 200 |
patch_video_resize_transform = transforms.Compose([
|
| 201 |
-
transforms.CenterCrop(
|
| 202 |
type_transform,
|
| 203 |
transforms.Normalize(mean=mean, std=std),
|
| 204 |
])
|
|
@@ -222,8 +222,8 @@ def process_video(video_path, max_num_frames=16, num_frames=16, sample_type='ran
|
|
| 222 |
|
| 223 |
def construct_video_sample(video_path):
|
| 224 |
|
| 225 |
-
patch_video = process_video(video_path, max_num_frames=16, num_frames=
|
| 226 |
-
patch_image = torch.zeros((3,
|
| 227 |
|
| 228 |
patch_type = torch.tensor([1])
|
| 229 |
patch_mask = torch.tensor([True])
|
|
@@ -279,7 +279,7 @@ def construct_audio_sample(audio_path):
|
|
| 279 |
|
| 280 |
|
| 281 |
patch_audio = process_audio(audio_path, sample_rate=48000, max_audio_len=480000, audio_cfg=AUDIO_CFG)
|
| 282 |
-
patch_image = torch.zeros((3,
|
| 283 |
|
| 284 |
patch_type = torch.tensor([2])
|
| 285 |
patch_mask = torch.tensor([True])
|
|
|
|
| 169 |
move2gpu(vqa_models, vqa_cfg)
|
| 170 |
move2gpu(general_models, general_cfg)
|
| 171 |
move2gpu(video_caption_models, general_cfg)
|
| 172 |
+
move2gpu(audio_caption_models, general_cfg)
|
| 173 |
|
| 174 |
# # Initialize generator
|
| 175 |
caption_generator = caption_task.build_generator(caption_models, caption_cfg.generation)
|
|
|
|
| 198 |
|
| 199 |
type_transform = transforms.Lambda(lambda x: x.float().div(255.0))
|
| 200 |
patch_video_resize_transform = transforms.Compose([
|
| 201 |
+
transforms.CenterCrop(video_caption_cfg.task.patch_frame_size),
|
| 202 |
type_transform,
|
| 203 |
transforms.Normalize(mean=mean, std=std),
|
| 204 |
])
|
|
|
|
| 222 |
|
| 223 |
def construct_video_sample(video_path):
|
| 224 |
|
| 225 |
+
patch_video = process_video(video_path, max_num_frames=16, num_frames=video_caption_cfg.task.num_frames, sample_type=video_caption_cfg.task.sample_type,)
|
| 226 |
+
patch_image = torch.zeros((3, video_caption_cfg.task.patch_image_size, video_caption_cfg.task.patch_image_size))
|
| 227 |
|
| 228 |
patch_type = torch.tensor([1])
|
| 229 |
patch_mask = torch.tensor([True])
|
|
|
|
| 279 |
|
| 280 |
|
| 281 |
patch_audio = process_audio(audio_path, sample_rate=48000, max_audio_len=480000, audio_cfg=AUDIO_CFG)
|
| 282 |
+
patch_image = torch.zeros((3, audio_caption_cfg.task.patch_image_size, audio_caption_cfg.task.patch_image_size))
|
| 283 |
|
| 284 |
patch_type = torch.tensor([2])
|
| 285 |
patch_mask = torch.tensor([True])
|