Commit e7a4186 · Parent: dc6d681
feat: use ONNX model

Files changed:
- app.py +40 -30
- config.json +235 -0
- preprocessor_config.json +27 -0
- utils.py +7 -12
- videomae_skeleton_v2.3.onnx +3 -0
app.py CHANGED

@@ -1,10 +1,11 @@
+import json
 import gradio as gr
+from time import time
+import onnxruntime as ort
 from mediapipe.python.solutions import holistic
 from torchvision.transforms.v2 import Compose, Lambda, Normalize
-from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
 from utils import get_predictions, preprocess

-
 title = '''

 '''
@@ -21,21 +22,17 @@ examples = [
     ['000_con_cho.mp4'],
 ]

-
-
-
-
-
-
-
-
-mean = image_processor.image_mean
-std = image_processor.image_std
-if 'shortest_edge' in image_processor.size:
-    model_input_height = model_input_width = image_processor.size['shortest_edge']
+ort_session = ort.InferenceSession('videomae_skeleton_v2.3.onnx')
+model_config = json.load(open('config.json'))
+preprocessor_config = json.load(open('preprocessor_config.json'))
+
+mean = preprocessor_config['image_mean']
+std = preprocessor_config['image_std']
+if 'shortest_edge' in preprocessor_config['size']:
+    model_input_height = model_input_width = preprocessor_config['size']['shortest_edge']
 else:
-    model_input_height =
-    model_input_width =
+    model_input_height = preprocessor_config['size']['height']
+    model_input_width = preprocessor_config['size']['width']

 # Define the transform.
 transform = Compose(
@@ -73,38 +70,51 @@ def inference(
         refine_face_landmarks=True,
     )

+    start_time = time()
     inputs = preprocess(
-        model_num_frames=
+        model_num_frames=model_config['num_frames'],
         keypoints_detector=keypoints_detector,
         source=video,
         model_input_height=model_input_height,
         model_input_width=model_input_width,
-        device=device,
         transform=transform,
     )
+    end_time = time()
+    data_time = end_time - start_time

     progress(1/2, desc='Getting predictions')
-
+    start_time = time()
+    predictions = get_predictions(
+        inputs=inputs,
+        ort_session=ort_session,
+        id2gloss=model_config['id2label'],
+        k=3,
+    )
+    end_time = time()
+    model_time = end_time - start_time

     if len(predictions) == 0:
         output_message = 'No sign language detected in the video. Please try again.'
     else:
         output_message = 'The top-3 predictions are:\n'
         for i, prediction in enumerate(predictions):
-            output_message += f'{i+1}. {prediction["label"]} ({prediction["score"]:2f})\n'
-        output_message
+            output_message += f'\t{i+1}. {prediction["label"]} ({prediction["score"]:2f})\n'
+    output_message += f'Data processing time: {data_time:.2f} seconds\n'
+    output_message += f'Model inference time: {model_time:.2f} seconds\n'
+    output_message += f'Total time: {data_time + model_time:.2f} seconds'

     progress(1/2, desc='Completed')

     return output_message


-iface = gr.Interface(
-
-
-
-
-
-
-)
-iface.launch()
+# iface = gr.Interface(
+#     fn=inference,
+#     inputs='video',
+#     outputs='text',
+#     examples=examples,
+#     title=title,
+#     description=description,
+# )
+# iface.launch()
+print(inference('000_con_cho.mp4'))
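Note on the ONNX file loaded above: the commit only adds the binary (see the Git LFS pointer at the end of this diff); the export step itself is not part of the change. Below is a rough, hypothetical sketch of how such a file could be produced from the transformers checkpoint named in config.json. The input name 'pixel_values' is chosen to match the dict key that utils.preprocess feeds to ort_session.run, and the dummy shape follows config.json (num_frames=16, num_channels=3, image_size=224). Whether the author actually exported the model this way is not shown anywhere in the commit.

# Hypothetical export sketch -- not part of this commit.
import torch
from transformers import VideoMAEForVideoClassification

model = VideoMAEForVideoClassification.from_pretrained('VieSignLang/videomae_skeleton_v2.3')
model.config.return_dict = False  # export plain tuple outputs instead of a ModelOutput
model.eval()

# (batch, num_frames, num_channels, height, width), as described by config.json.
dummy_clip = torch.randn(1, 16, 3, 224, 224)

torch.onnx.export(
    model,
    (dummy_clip,),
    'videomae_skeleton_v2.3.onnx',
    input_names=['pixel_values'],   # must match the key built in utils.preprocess
    output_names=['logits'],
    opset_version=14,
)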
config.json ADDED

@@ -0,0 +1,235 @@
+{
+  "_name_or_path": "VieSignLang/videomae_skeleton_v2.3",
+  "architectures": [
+    "VideoMAEForVideoClassification"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "decoder_hidden_size": 192,
+  "decoder_intermediate_size": 768,
+  "decoder_num_attention_heads": 3,
+  "decoder_num_hidden_layers": 12,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 384,
+  "id2label": {
+    "0": "Con ch\u00f3",
+    "1": "Con m\u00e8o",
+    "2": "Con g\u00e0",
+    "3": "Con v\u1ecbt",
+    "4": "Con r\u00f9a",
+    "5": "Con th\u1ecf",
+    "6": "Con tr\u00e2u",
+    "7": "Con b\u00f2",
+    "8": "Con d\u00ea",
+    "9": "Con heo",
+    "10": "M\u00e0u \u0111en",
+    "11": "M\u00e0u tr\u1eafng",
+    "12": "M\u00e0u \u0111\u1ecf",
+    "13": "M\u00e0u cam",
+    "14": "M\u00e0u v\u00e0ng",
+    "15": "M\u00e0u l\u00e1 c\u00e2y",
+    "16": "M\u00e0u da tr\u1eddi",
+    "17": "M\u00e0u h\u1ed3ng",
+    "18": "M\u00e0u t\u00edm",
+    "19": "M\u00e0u n\u00e2u",
+    "20": "Qu\u1ea3 d\u00e2u",
+    "21": "Qu\u1ea3 m\u1eadn",
+    "22": "Qu\u1ea3 d\u1ee9a",
+    "23": "Qu\u1ea3 \u0111\u00e0o",
+    "24": "Qu\u1ea3 \u0111u \u0111\u1ee7",
+    "25": "Qu\u1ea3 cam",
+    "26": "Qu\u1ea3 b\u01a1",
+    "27": "Qu\u1ea3 chu\u1ed1i",
+    "28": "Qu\u1ea3 xo\u00e0i",
+    "29": "Qu\u1ea3 d\u1eeba",
+    "30": "B\u1ed1",
+    "31": "M\u1eb9",
+    "32": "Con trai",
+    "33": "Con g\u00e1i",
+    "34": "V\u1ee3",
+    "35": "Ch\u1ed3ng",
+    "36": "\u00d4ng n\u1ed9i",
+    "37": "B\u00e0 n\u1ed9i",
+    "38": "\u00d4ng ngo\u1ea1i",
+    "39": "B\u00e0 ngo\u1ea1i",
+    "40": "\u0102n",
+    "41": "U\u1ed1ng",
+    "42": "Xem",
+    "43": "Th\u00e8m",
+    "44": "M\u00e1ch",
+    "45": "Kh\u00f3c",
+    "46": "C\u01b0\u1eddi",
+    "47": "H\u1ecdc",
+    "48": "D\u1ed7i",
+    "49": "Ch\u1ebft",
+    "50": "\u0110i",
+    "51": "Ch\u1ea1y",
+    "52": "B\u1eadn",
+    "53": "H\u00e1t",
+    "54": "M\u00faa",
+    "55": "N\u1ea5u",
+    "56": "N\u01b0\u1edbng",
+    "57": "Nh\u1ea7m l\u1eabn",
+    "58": "Quan s\u00e1t",
+    "59": "C\u1eafm tr\u1ea1i",
+    "60": "Cung c\u1ea5p",
+    "61": "B\u1eaft ch\u01b0\u1edbc",
+    "62": "B\u1eaft bu\u1ed9c",
+    "63": "B\u00e1o c\u00e1o",
+    "64": "Mua b\u00e1n",
+    "65": "Kh\u00f4ng quen",
+    "66": "Kh\u00f4ng n\u00ean",
+    "67": "Kh\u00f4ng c\u1ea7n",
+    "68": "Kh\u00f4ng cho",
+    "69": "Kh\u00f4ng nghe l\u1eddi",
+    "70": "M\u1eb7n",
+    "71": "\u0110\u1eafng",
+    "72": "Cay",
+    "73": "Ng\u1ecdt",
+    "74": "\u0110\u1eadm",
+    "75": "Nh\u1ea1t",
+    "76": "Ngon mi\u1ec7ng",
+    "77": "X\u1ea5u",
+    "78": "\u0110\u1eb9p",
+    "79": "Ch\u1eadt",
+    "80": "H\u1eb9p",
+    "81": "R\u1ed9ng",
+    "82": "D\u00e0i",
+    "83": "Cao",
+    "84": "L\u00f9n",
+    "85": "\u1ed0m",
+    "86": "M\u1eadp",
+    "87": "Ngoan",
+    "88": "H\u01b0",
+    "89": "Kh\u1ecfe",
+    "90": "M\u1ec7t",
+    "91": "\u0110au",
+    "92": "Gi\u1ecfi",
+    "93": "Ch\u0103m ch\u1ec9",
+    "94": "L\u01b0\u1eddi bi\u1ebfng",
+    "95": "T\u1ed1t b\u1ee5ng",
+    "96": "Th\u00fa v\u1ecb",
+    "97": "H\u00e0i h\u01b0\u1edbc",
+    "98": "D\u0169ng c\u1ea3m",
+    "99": "S\u00e1ng t\u1ea1o"
+  },
+  "image_size": 224,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "label2id": {
+    "B\u00e0 ngo\u1ea1i": 39,
+    "B\u00e0 n\u1ed9i": 37,
+    "B\u00e1o c\u00e1o": 63,
+    "B\u1eadn": 52,
+    "B\u1eaft bu\u1ed9c": 62,
+    "B\u1eaft ch\u01b0\u1edbc": 61,
+    "B\u1ed1": 30,
+    "Cao": 83,
+    "Cay": 72,
+    "Ch\u0103m ch\u1ec9": 93,
+    "Ch\u1ea1y": 51,
+    "Ch\u1eadt": 79,
+    "Ch\u1ebft": 49,
+    "Ch\u1ed3ng": 35,
+    "Con b\u00f2": 7,
+    "Con ch\u00f3": 0,
+    "Con d\u00ea": 8,
+    "Con g\u00e0": 2,
+    "Con g\u00e1i": 33,
+    "Con heo": 9,
+    "Con m\u00e8o": 1,
+    "Con r\u00f9a": 4,
+    "Con th\u1ecf": 5,
+    "Con trai": 32,
+    "Con tr\u00e2u": 6,
+    "Con v\u1ecbt": 3,
+    "Cung c\u1ea5p": 60,
+    "C\u01b0\u1eddi": 46,
+    "C\u1eafm tr\u1ea1i": 59,
+    "D\u00e0i": 82,
+    "D\u0169ng c\u1ea3m": 98,
+    "D\u1ed7i": 48,
+    "Gi\u1ecfi": 92,
+    "H\u00e0i h\u01b0\u1edbc": 97,
+    "H\u00e1t": 53,
+    "H\u01b0": 88,
+    "H\u1eb9p": 80,
+    "H\u1ecdc": 47,
+    "Kh\u00f3c": 45,
+    "Kh\u00f4ng cho": 68,
+    "Kh\u00f4ng c\u1ea7n": 67,
+    "Kh\u00f4ng nghe l\u1eddi": 69,
+    "Kh\u00f4ng n\u00ean": 66,
+    "Kh\u00f4ng quen": 65,
+    "Kh\u1ecfe": 89,
+    "L\u00f9n": 84,
+    "L\u01b0\u1eddi bi\u1ebfng": 94,
+    "Mua b\u00e1n": 64,
+    "M\u00e0u cam": 13,
+    "M\u00e0u da tr\u1eddi": 16,
+    "M\u00e0u h\u1ed3ng": 17,
+    "M\u00e0u l\u00e1 c\u00e2y": 15,
+    "M\u00e0u n\u00e2u": 19,
+    "M\u00e0u tr\u1eafng": 11,
+    "M\u00e0u t\u00edm": 18,
+    "M\u00e0u v\u00e0ng": 14,
+    "M\u00e0u \u0111en": 10,
+    "M\u00e0u \u0111\u1ecf": 12,
+    "M\u00e1ch": 44,
+    "M\u00faa": 54,
+    "M\u1eadp": 86,
+    "M\u1eb7n": 70,
+    "M\u1eb9": 31,
+    "M\u1ec7t": 90,
+    "Ngoan": 87,
+    "Ngon mi\u1ec7ng": 76,
+    "Ng\u1ecdt": 73,
+    "Nh\u1ea1t": 75,
+    "Nh\u1ea7m l\u1eabn": 57,
+    "N\u01b0\u1edbng": 56,
+    "N\u1ea5u": 55,
+    "Quan s\u00e1t": 58,
+    "Qu\u1ea3 b\u01a1": 26,
+    "Qu\u1ea3 cam": 25,
+    "Qu\u1ea3 chu\u1ed1i": 27,
+    "Qu\u1ea3 d\u00e2u": 20,
+    "Qu\u1ea3 d\u1ee9a": 22,
+    "Qu\u1ea3 d\u1eeba": 29,
+    "Qu\u1ea3 m\u1eadn": 21,
+    "Qu\u1ea3 xo\u00e0i": 28,
+    "Qu\u1ea3 \u0111u \u0111\u1ee7": 24,
+    "Qu\u1ea3 \u0111\u00e0o": 23,
+    "R\u1ed9ng": 81,
+    "S\u00e1ng t\u1ea1o": 99,
+    "Th\u00e8m": 43,
+    "Th\u00fa v\u1ecb": 96,
+    "T\u1ed1t b\u1ee5ng": 95,
+    "U\u1ed1ng": 41,
+    "V\u1ee3": 34,
+    "Xem": 42,
+    "X\u1ea5u": 77,
+    "\u00d4ng ngo\u1ea1i": 38,
+    "\u00d4ng n\u1ed9i": 36,
+    "\u0102n": 40,
+    "\u0110au": 91,
+    "\u0110i": 50,
+    "\u0110\u1eadm": 74,
+    "\u0110\u1eafng": 71,
+    "\u0110\u1eb9p": 78,
+    "\u1ed0m": 85
+  },
+  "layer_norm_eps": 1e-12,
+  "model_type": "videomae",
+  "norm_pix_loss": true,
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_frames": 16,
+  "num_hidden_layers": 12,
+  "patch_size": 16,
+  "problem_type": "single_label_classification",
+  "qkv_bias": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.28.1",
+  "tubelet_size": 2,
+  "use_mean_pooling": true
+}
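One detail worth noting about how app.py consumes this file: json.load keeps the id2label keys as strings, which is why get_predictions in utils.py looks glosses up with str(...). A minimal check, using the file names from this commit:

import json

model_config = json.load(open('config.json'))
print(model_config['num_frames'])      # 16 -- frames fed to the model per clip
print(model_config['id2label']['0'])   # 'Con chó' -- keys are strings, not ints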
preprocessor_config.json ADDED

@@ -0,0 +1,27 @@
+{
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "feature_extractor_type": "VideoMAEFeatureExtractor",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "VideoMAEImageProcessor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}
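app.py derives mean, std, and the model input size from this file, but the body of its transform = Compose(...) call lies outside the hunks shown above. The snippet below is a plausible sketch only, assuming a simple rescale-and-normalize pipeline built from the imports app.py already has (Compose, Lambda, Normalize); the actual transform in the repository may differ.

import torch
from torchvision.transforms.v2 import Compose, Lambda, Normalize

mean = [0.485, 0.456, 0.406]  # preprocessor_config['image_mean']
std = [0.229, 0.224, 0.225]   # preprocessor_config['image_std']

transform = Compose([
    Lambda(lambda x: x / 255.0),    # mirrors rescale_factor = 1/255
    Normalize(mean=mean, std=std),
])

frame = torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8).float()
normalized = transform(frame)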
utils.py CHANGED

@@ -1,10 +1,10 @@
 import cv2
-import torch
 import numpy as np
+import onnxruntime as ort
+import torch
 from mediapipe.python.solutions import (drawing_styles, drawing_utils,
                                         holistic, pose)
 from torchvision.transforms.v2 import Compose, UniformTemporalSubsample
-from transformers import VideoMAEForVideoClassification


 def draw_skeleton_on_image(
@@ -178,7 +178,8 @@ def do_hands_relax(

 def get_predictions(
     inputs: dict,
-
+    ort_session: ort.InferenceSession,
+    id2gloss: dict,
     k: int = 3,
 ) -> list:
     '''
@@ -201,9 +202,7 @@ def get_predictions(
     if inputs is None:
         return []

-
-        outputs = model(**inputs)
-    logits = outputs.logits
+    logits = torch.from_numpy(ort_session.run(None, inputs)[0])

     # Get top-3 predictions
     topk_scores, topk_indices = torch.topk(logits, k, dim=1)
@@ -212,7 +211,7 @@

     return [
         {
-            'label':
+            'label': id2gloss[str(topk_indices[i])],
             'score': topk_scores[i],
         }
         for i in range(k)
@@ -225,7 +224,6 @@ def preprocess(
     source: str,
     model_input_height: int,
     model_input_width: int,
-    device: str,
     transform: Compose,
 ) -> dict:
     '''
@@ -243,8 +241,6 @@
         Model input height.
     model_input_width : int
         Model input width.
-    device : str
-        Device to use.
     transform : Compose
         Transform to apply.

@@ -292,8 +288,7 @@
     skeleton_video = torch.stack(skeleton_video)
     skeleton_video = UniformTemporalSubsample(model_num_frames)(skeleton_video)
     inputs = {
-        'pixel_values': skeleton_video.unsqueeze(0),
+        'pixel_values': skeleton_video.unsqueeze(0).numpy(),
     }
-    inputs = {k: v.to(device) for k, v in inputs.items()}

     return inputs
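For reference, a standalone sketch of the ONNX Runtime call that get_predictions now wraps. The softmax step is an addition for illustration only (the committed code reports raw logit values as 'score'), the random clip merely shows the (batch, frames, channels, height, width) layout produced by preprocess, and the input name 'pixel_values' comes from the dict key that preprocess builds.

import numpy as np
import onnxruntime as ort

ort_session = ort.InferenceSession('videomae_skeleton_v2.3.onnx')

# Dummy clip with the layout produced by utils.preprocess: (1, 16, 3, 224, 224).
pixel_values = np.random.randn(1, 16, 3, 224, 224).astype(np.float32)

logits = ort_session.run(None, {'pixel_values': pixel_values})[0]  # shape (1, num_labels)
probs = np.exp(logits - logits.max()) / np.exp(logits - logits.max()).sum()
top3 = probs[0].argsort()[::-1][:3]
print(top3, probs[0][top3])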
videomae_skeleton_v2.3.onnx ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:271d0e3d932fffc036b6cef4f8c90721e223e32816ef16bb853c890b0f3b90c7
+size 90390035