Spaces:

RP-IPM
/

videomae-base-ipm

Sleeping

App Files Files Community

hocheewai committed on Jul 18, 2023

Commit

398626e

1 Parent(s): 9f6da01

Upload 11 files

Browse files

Files changed (12) hide show

.gitattributes +9 -0
app.py +140 -0
examples/20230704_112128.mp4 +3 -0
examples/20230704_112231.mp4 +3 -0
examples/20230704_112517.mp4 +3 -0
examples/20230704_112544.mp4 +3 -0
examples/20230704_112630.mp4 +3 -0
examples/20230704_112700.mp4 +3 -0
examples/bend.mp4 +3 -0
examples/cnw.mp4 +3 -0
examples/lift.mp4 +3 -0
requirements.txt +7 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/20230704_112128.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/20230704_112231.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/20230704_112517.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/20230704_112544.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/20230704_112630.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/20230704_112700.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/bend.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/cnw.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/lift.mp4 filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import cv2
+import gradio as gr
+import imutils
+import numpy as np
+import torch
+from pytorchvideo.transforms import (
+    ApplyTransformToKey,
+    Normalize,
+    RandomShortSideScale,
+    RemoveKey,
+    ShortSideScale,
+    UniformTemporalSubsample,
+)
+from torchvision.transforms import (
+    Compose,
+    Lambda,
+    RandomCrop,
+    RandomHorizontalFlip,
+    Resize,
+)
+from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
+MODEL_CKPT = "rickysk/videomae-base-ipm_all_videos"
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+MODEL = VideoMAEForVideoClassification.from_pretrained(MODEL_CKPT).to(DEVICE)
+PROCESSOR = VideoMAEFeatureExtractor.from_pretrained(MODEL_CKPT)
+RESIZE_TO = PROCESSOR.size["shortest_edge"]
+NUM_FRAMES_TO_SAMPLE = MODEL.config.num_frames
+IMAGE_STATS = {"image_mean": [0.485, 0.456, 0.406], "image_std": [0.229, 0.224, 0.225]}
+VAL_TRANSFORMS = Compose(
+    [
+        UniformTemporalSubsample(NUM_FRAMES_TO_SAMPLE),
+        Lambda(lambda x: x / 255.0),
+        Normalize(IMAGE_STATS["image_mean"], IMAGE_STATS["image_std"]),
+        Resize((RESIZE_TO, RESIZE_TO)),
+    ]
+)
+LABELS = list(MODEL.config.label2id.keys())
+def parse_video(video_file):
+    """A utility to parse the input videos.
+    Reference: https://pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv/
+    """
+    vs = cv2.VideoCapture(video_file)
+    # try to determine the total number of frames in the video file
+    try:
+        prop = (
+            cv2.cv.CV_CAP_PROP_FRAME_COUNT
+            if imutils.is_cv2()
+            else cv2.CAP_PROP_FRAME_COUNT
+        )
+        total = int(vs.get(prop))
+        print("[INFO] {} total frames in video".format(total))
+    # an error occurred while trying to determine the total
+    # number of frames in the video file
+    except:
+        print("[INFO] could not determine # of frames in video")
+        print("[INFO] no approx. completion time can be provided")
+        total = -1
+    frames = []
+    # loop over frames from the video file stream
+    while True:
+        # read the next frame from the file
+        (grabbed, frame) = vs.read()
+        if frame is not None:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frames.append(frame)
+        # if the frame was not grabbed, then we have reached the end
+        # of the stream
+        if not grabbed:
+            break
+    return frames
+def preprocess_video(frames: list):
+    """Utility to apply preprocessing transformations to a video tensor."""
+    # Each frame in the `frames` list has the shape: (height, width, num_channels).
+    # Collated together the `frames` has the the shape: (num_frames, height, width, num_channels).
+    # So, after converting the `frames` list to a torch tensor, we permute the shape
+    # such that it becomes (num_channels, num_frames, height, width) to make
+    # the shape compatible with the preprocessing transformations. After applying the
+    # preprocessing chain, we permute the shape to (num_frames, num_channels, height, width)
+    # to make it compatible with the model. Finally, we add a batch dimension so that our video
+    # classification model can operate on it.
+    video_tensor = torch.tensor(np.array(frames).astype(frames[0].dtype))
+    video_tensor = video_tensor.permute(
+        3, 0, 1, 2
+    )  # (num_channels, num_frames, height, width)
+    video_tensor_pp = VAL_TRANSFORMS(video_tensor)
+    video_tensor_pp = video_tensor_pp.permute(
+        1, 0, 2, 3
+    )  # (num_frames, num_channels, height, width)
+    video_tensor_pp = video_tensor_pp.unsqueeze(0)
+    return video_tensor_pp.to(DEVICE)
+def infer(video_file):
+    frames = parse_video(video_file)
+    video_tensor = preprocess_video(frames)
+    inputs = {"pixel_values": video_tensor}
+    # forward pass
+    with torch.no_grad():
+        outputs = MODEL(**inputs)
+        logits = outputs.logits
+    softmax_scores = torch.nn.functional.softmax(logits, dim=-1).squeeze(0)
+    confidences = {LABELS[i]: float(softmax_scores[i]) for i in range(len(LABELS))}
+    return confidences
+gr.Interface(
+    fn=infer,
+    inputs=gr.Video(type="file"),
+    outputs=gr.Label(num_top_classes=7),
+    examples=[
+        ["examples/bend.mp4"],
+        ["examples/cnw.mp4"],
+        ["examples/lift.mp4"],
+    ],
+    title="VideoMAE IPM",
+    description=(
+        "Gradio demo for VideoMAE for video classification. To use it, simply upload your video or click one of the"
+        " examples to load them. Read more at the links below."
+    ),
+    article=(
+        "<div style='text-align: center;'><a href='https://huggingface.co/docs/transformers/model_doc/videomae' target='_blank'>VideoMAE</a>"
+        " <center><a href='https://huggingface.co/rickysk/videomae-base-ipm_all_videos' target='_blank'>Fine-tuned Model</a></center></div>"
+    ),
+    allow_flagging=False,
+    allow_screenshot=False,
+).launch()

examples/20230704_112128.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50c7f29dd5e61bdbe075c414e2d568858ae0c27cf9f9746174bbaef265366bcf
+size 31218451

examples/20230704_112231.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd27fdb9f9821b7c97977799c54510e10f551909d6b9dfa39a21683cc39d3d2f
+size 25011479

examples/20230704_112517.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ebc272367ebaf9f5644e5c9ed7121530b735a5f018565b6cfb14ac3cfae1ddd9
+size 21237452

examples/20230704_112544.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb7f42505ff71e21900ebae99daa3b2f1c1fa9314dce1d22ec45617ad569e9e7
+size 23387650

examples/20230704_112630.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d64536bae42eb93001f70386029b1b6c017880529dfd17a736041b22ac1156c4
+size 22199976

examples/20230704_112700.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45acda04b126f6bf8b109a2fd0723bbacba8021d3ea88d2795ed1f979b994e7b
+size 15992441

examples/bend.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a494836def513d0ac9522e0ba8960a241189b1303089384d3abb01579b54b185
+size 8884428

examples/cnw.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1867270b75db4e803b37b74037891db0666509ce5ca3aad00c729f13b38c100f
+size 12356306

examples/lift.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:749a7cd0e64837f22597b4f6d0f668f8f62d66e87fcf6c05e0a3a6f049ff9c58
+size 12535819

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+opencv-python
+imutils
+numpy
+torch
+torchvision
+pytorchvideo
+transformers