|
|
import gradio as gr |
|
|
import cv2 |
|
|
import numpy as np |
|
|
from gradio_webrtc import WebRTC |
|
|
from twilio.rest import Client |
|
|
import os |
|
|
import spaces |
|
|
from threading import Lock |
|
|
from collections import defaultdict |
|
|
import time |
|
|
from bisect import bisect_left |
|
|
from scipy.spatial.distance import cdist |
|
|
import mediapipe as mp |
|
|
from mediapipe.tasks import python |
|
|
from mediapipe.tasks.python import vision |
|
|
from mediapipe.framework.formats import landmark_pb2 |
|
|
from mediapipe import solutions |
|
|
import pdb |
|
|
import json |
|
|
from moviepy.editor import VideoFileClip |
|
|
import librosa |
|
|
|
|
|
|
|
|
# NOTE(review): a first BaseOptions pointing at 'gesture_recognizer.task' was
# dead code here — it was immediately overwritten by the hand-landmarker
# options below before any use, so it has been removed. Restore it under its
# own name (e.g. gesture_base_options) if gesture recognition is ever wired up.

mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# GPU-delegated hand-landmark model shared by both detectors below.
base_options = mp.tasks.BaseOptions(
    model_asset_path='hand_landmarker.task',
    delegate=mp.tasks.BaseOptions.Delegate.GPU,
)

# Detector for frame sequences; VIDEO mode requires monotonically
# increasing timestamps across detect_for_video() calls.
options = vision.HandLandmarkerOptions(
    base_options=base_options,
    running_mode=mp.tasks.vision.RunningMode.VIDEO,
    num_hands=2,
)
detector = vision.HandLandmarker.create_from_options(options)

# Detector for single still images.
options_image = vision.HandLandmarkerOptions(
    base_options=base_options,
    running_mode=mp.tasks.vision.RunningMode.IMAGE,
    num_hands=2,
)
detector_image = vision.HandLandmarker.create_from_options(options_image)

# Default frame size (w, h); shadowed by a local of the same name inside
# ReferenceVideo.load_video, which uses each clip's native size instead.
video_size = (500, 500)

# Last timestamp (ms) handed to the VIDEO-mode detector; shared across
# clips so timestamps stay strictly increasing between load_video calls.
previous_timestamp = None
|
|
|
|
|
class ReferenceVideo:
    """Extracts and caches per-frame hand keypoints, frames, and audio
    from a reference video, persisting keypoints and metadata to JSON."""

    def __init__(self):
        # Per-hand lists with one entry per frame: a list of 21 (x, y, z)
        # landmark tuples, or None when that hand was not detected.
        self.keypoints = {"Left": [], "Right": []}
        # Frames (BGR) resized to the clip's native size.
        self.frames = []
        # Mono 48 kHz float32 audio of the most recently loaded clip.
        self.audio = None

    def load_video(self, video_path):
        """Process *video_path*: detect hand landmarks per frame, cache
        frames/keypoints/audio on the instance, and write
        ``<stem>_keypoints.json`` and ``<stem>_meta.json`` next to the video.

        Uses the module-level ``detector`` (VIDEO mode) and mutates the
        module-level ``previous_timestamp`` to keep detector timestamps
        strictly increasing across successive clips.
        """
        global previous_timestamp

        # Reset per-clip state so repeated calls don't accumulate.
        self.keypoints = {"Left": [], "Right": []}
        self.frames = []

        video = VideoFileClip(video_path)
        fps = video.fps
        video_size = (video.size[0], video.size[1])

        # Extract the soundtrack and resample to 48 kHz.
        audio = video.audio.to_soundarray()
        original_sr = video.audio.fps
        audio = librosa.resample(audio.T, orig_sr=original_sr, target_sr=48000).T
        # Downmix stereo to mono by averaging the two channels.
        if audio.ndim == 2 and audio.shape[1] == 2:
            audio = 0.5 * (audio[:, 0] + audio[:, 1])
        # BUG FIX: the resampled audio was previously computed and then
        # discarded; keep it on the instance so callers can use it.
        self.audio = audio.astype(np.float32)

        cap = cv2.VideoCapture(video_path)
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # MediaPipe expects an SRGB image; OpenCV frames are BGR.
                rgb = mp.Image(
                    image_format=mp.ImageFormat.SRGB,
                    data=np.array(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)),
                )

                # detect_for_video requires strictly increasing timestamps;
                # bump by 1 ms if the capture reports a non-increasing one.
                timestamp_ms = int(cap.get(cv2.CAP_PROP_POS_MSEC))
                if previous_timestamp is not None and timestamp_ms <= previous_timestamp:
                    timestamp_ms = previous_timestamp + 1
                previous_timestamp = timestamp_ms

                results = detector.detect_for_video(rgb, timestamp_ms)

                # Record landmarks per hand; a hand missing from this frame
                # stays None so the Left/Right lists remain frame-aligned.
                frame_landmarks = {"Left": None, "Right": None}
                if results.hand_landmarks and results.handedness:
                    for idx, hand_landmarks in enumerate(results.hand_landmarks):
                        label = results.handedness[idx][0].category_name
                        frame_landmarks[label] = [
                            (lm.x, lm.y, lm.z) for lm in hand_landmarks
                        ]

                self.keypoints["Left"].append(frame_landmarks["Left"])
                self.keypoints["Right"].append(frame_landmarks["Right"])
                self.frames.append(cv2.resize(frame, video_size))

                # Drop per-frame MediaPipe objects eagerly to bound memory.
                del results
                del rgb
        finally:
            # Release resources even if detection raises mid-clip.
            cap.release()
            video.close()

        # BUG FIX: previously the keypoints JSON was rewritten on every frame
        # inside the loop; write it once after processing completes.
        output_path = os.path.splitext(video_path)[0] + "_keypoints.json"
        with open(output_path, "w") as f:
            json.dump(self.keypoints, f)

        metadata = {"fps": fps, "video_size": video_size}
        with open(f"{os.path.splitext(video_path)[0]}_meta.json", "w") as f:
            json.dump(metadata, f)
|
|
|
|
|
# Pre-load the bundled reference clips at import time so their
# keypoints/metadata JSON files exist before the app starts serving.
ref_video = ReferenceVideo()

video_paths = [
    'predefined/Move12_preview.mp4',
    'predefined/Move12_main.mp4',
]

for video_path in video_paths:
    ref_video.load_video(video_path)