Spaces:

innat
/

Video-FocalNet

Runtime error

App Files Community

innat committed on Nov 1, 2023

Commit

222819d

1 Parent(s): 6022c8d

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -89

app.py CHANGED Viewed

@@ -6,114 +6,59 @@ import imageio
 import tensorflow as tf
 from tensorflow import keras
-from utils import TubeMaskingGenerator
-from utils import read_video, frame_sampling, denormalize, reconstrunction
-from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_size
-from labels import K400_label_map, SSv2_label_map, UCF_label_map
 LABEL_MAPS = {
     'K400': K400_label_map,
-    'SSv2': SSv2_label_map,
-    'UCF' : UCF_label_map
 }
 ALL_MODELS = [
-    'TFVideoMAE_L_K400_16x224',
-    'TFVideoMAE_B_SSv2_16x224',
-    'TFVideoMAE_B_UCF_16x224',
 ]
 sample_example = [
-    ["examples/k400.mp4",  ALL_MODELS[0], 0.9],
-    ["examples/ssv2.mp4",  ALL_MODELS[1], 0.8],
-    ["examples/ucf.mp4",   ALL_MODELS[2], 0.7],
 ]
-def tube_mask_generator(mask_ratio):
-    window_size = (
-        num_frames // 2,
-        input_size // patch_size[0],
-        input_size // patch_size[1]
-    )
-    tube_mask = TubeMaskingGenerator(
-        input_size=window_size,
-        mask_ratio=mask_ratio
-    )
-    make_bool = tube_mask()
-    bool_masked_pos_tf = tf.constant(make_bool, dtype=tf.int32)
-    bool_masked_pos_tf = tf.expand_dims(bool_masked_pos_tf, axis=0)
-    bool_masked_pos_tf = tf.cast(bool_masked_pos_tf, tf.bool)
-    return bool_masked_pos_tf
 def get_model(model_type):
-    ft_path = keras.utils.get_file(
-        origin=f'https://github.com/innat/VideoMAE/releases/download/v1.1/{model_type}_FT.zip',
-    )
-    pt_path = keras.utils.get_file(
-        origin=f'https://github.com/innat/VideoMAE/releases/download/v1.1/{model_type}_PT.zip',
     )
-    with zipfile.ZipFile(ft_path, 'r') as zip_ref:
         zip_ref.extractall('./')
-    with zipfile.ZipFile(pt_path, 'r') as zip_ref:
-        zip_ref.extractall('./')
-    ft_model = keras.models.load_model(model_type + '_FT')
-    pt_model = keras.models.load_model(model_type + '_PT')
     if 'K400' in model_type:
         data_type = 'K400'
-    elif 'SSv2' in model_type:
-        data_type = 'SSv2'
     else:
-        data_type = 'UCF'
     label_map = LABEL_MAPS.get(data_type)
     label_map = {v: k for k, v in label_map.items()}
-    return ft_model, pt_model, label_map
-def inference(video_file, model_type, mask_ratio):
     # get sample data
     container = read_video(video_file)
     frames = frame_sampling(container, num_frames=num_frames)
     # get models
-    bool_masked_pos_tf = tube_mask_generator(mask_ratio)
-    ft_model, pt_model, label_map = get_model(model_type)
-    ft_model.trainable = False
-    pt_model.trainable = False
-    # inference on fine-tune model
-    outputs_ft = ft_model(frames[None, ...], training=False)
-    probabilities = tf.nn.softmax(outputs_ft).numpy().squeeze(0)
     confidences = {
         label_map[i]: float(probabilities[i]) for i in np.argsort(probabilities)[::-1]
     }
-    # inference on pre-trained model
-    outputs_pt = pt_model(frames[None, ...], bool_masked_pos_tf, training=False)
-    reconstruct_output, mask = reconstrunction(
-        frames[None, ...], bool_masked_pos_tf, outputs_pt
-    )
-    # post process
-    input_frame = denormalize(frames)
-    input_mask = denormalize(mask[0] * frames)
-    output_frame = denormalize(reconstruct_output)
-    frames = []
-    for frame_a, frame_b, frame_c in zip(input_frame, input_mask, output_frame):
-        combined_frame = np.hstack([frame_a, frame_b, frame_c])
-        frames.append(combined_frame)
-    combined_gif = 'combined.gif'
-    imageio.mimsave(combined_gif, frames, duration=300, loop=0)
-    return confidences, combined_gif
 def main():
@@ -123,26 +68,14 @@ def main():
             gr.Video(type="file", label="Input Video"),
             gr.Dropdown(
                 choices=ALL_MODELS,
-                default="TFVideoMAE_L_K400_16x224",
                 label="Model"
-            ),
-            gr.Slider(
-                0.5,
-                1.0,
-                step=0.1,
-                default=0.5,
-                label='Mask Ratio'
             )
         ],
-        outputs=[
-            gr.Label(num_top_classes=3, label='scores'),
-            gr.Image(type="filepath", label='reconstructed')
-        ],
         examples=sample_example,
-        title="VideoMAE",
-        description="Keras reimplementation of <a href='https://github.com/innat/VideoMAE'>VideoMAE</a> is presented here."
     )
     iface.launch()
 if __name__ == '__main__':

 import tensorflow as tf
 from tensorflow import keras
+from utils import read_video, frame_sampling
+from utils import num_frames, patch_size, input_size
+from labels import K400_label_map
 LABEL_MAPS = {
     'K400': K400_label_map,
 }
 ALL_MODELS = [
+    'TFVideoFocalNetB_K400_8x224',
 ]
 sample_example = [
+    ["examples/k400.mp4",  ALL_MODELS[0]],
 ]
 def get_model(model_type):
+    model_path = keras.utils.get_file(
+        origin=f'https://github.com/innat/Video-FocalNets/releases/download/v1.1/{model_type}.zip',
     )
+    with zipfile.ZipFile(model_path, 'r') as zip_ref:
         zip_ref.extractall('./')
+    model = keras.models.load_model(model_type)
     if 'K400' in model_type:
         data_type = 'K400'
     else:
+        data_type = 'SSv2'
     label_map = LABEL_MAPS.get(data_type)
     label_map = {v: k for k, v in label_map.items()}
+    return model, label_map
+def inference(video_file, model_type):
     # get sample data
     container = read_video(video_file)
     frames = frame_sampling(container, num_frames=num_frames)
     # get models
+    model, label_map = get_model(model_type)
+    model.trainable = False
+    # inference on model
+    outputs = model(frames[None, ...], training=False)
+    probabilities = tf.nn.softmax(outputs).numpy().squeeze(0)
     confidences = {
         label_map[i]: float(probabilities[i]) for i in np.argsort(probabilities)[::-1]
     }
+    return confidences
 def main():
             gr.Video(type="file", label="Input Video"),
             gr.Dropdown(
                 choices=ALL_MODELS,
                 label="Model"
             )
         ],
+        outputs=gr.Label(num_top_classes=3, label='scores'),
         examples=sample_example,
+        title="Video-FocalNets: Spatio-Temporal Focal Modulation.",
+        description="Keras reimplementation of <a href='https://github.com/innat/Video-FocalNets'>Video-FocalNets</a> is presented here."
     )
     iface.launch()
 if __name__ == '__main__':