linoyts HF Staff commited on
Commit
ed65396
·
verified ·
1 Parent(s): 11c1efe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +210 -53
app.py CHANGED
@@ -8,56 +8,137 @@ from transformers import AutoProcessor, VitPoseForPoseEstimation, RTDetrForObjec
8
  from PIL import Image
9
  import torch
10
 
 
 
 
11
 
12
- # COCO keypoint connections for skeleton visualization
13
- SKELETON_EDGES = [
14
  (0, 1), (0, 2), (1, 3), (2, 4), # head
15
  (5, 6), (5, 7), (7, 9), (6, 8), (8, 10), # arms
16
  (5, 11), (6, 12), (11, 12), # torso
17
  (11, 13), (13, 15), (12, 14), (14, 16) # legs
18
  ]
19
 
20
- KEYPOINT_COLORS = [
21
  (255, 0, 0), (255, 85, 0), (255, 170, 0), (255, 255, 0),
22
  (170, 255, 0), (85, 255, 0), (0, 255, 0), (0, 255, 85),
23
  (0, 255, 170), (0, 255, 255), (0, 170, 255), (0, 85, 255),
24
  (0, 0, 255), (85, 0, 255), (170, 0, 255), (255, 0, 255), (255, 0, 170)
25
  ]
26
 
27
- device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- # Load models
30
  print("Loading models...")
 
 
31
  person_detector = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365").to(device)
32
  person_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
33
 
34
- vitpose_model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple").to(device)
35
- vitpose_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
36
- print("Models loaded!")
 
 
 
 
 
 
37
 
 
38
 
39
- def detect_person(image, processor, model):
40
- """Detect person bounding box in image."""
41
- inputs = processor(images=image, return_tensors="pt").to(device)
42
  with torch.no_grad():
43
- outputs = model(**inputs)
44
 
45
- results = processor.post_process_object_detection(
46
  outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
47
  )
48
 
49
- # Find person detections (class 0 in COCO)
50
  boxes = []
51
  for result in results:
52
  for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
53
- if label.item() == 0: # person class
54
  boxes.append(box.cpu().numpy())
55
 
56
  return boxes if boxes else None
57
 
 
58
  @spaces.GPU
59
- def estimate_pose(image, boxes, processor, model):
60
- """Estimate pose keypoints for detected persons."""
 
 
 
 
 
 
 
61
  inputs = processor(images=image, boxes=[boxes], return_tensors="pt").to(device)
62
  with torch.no_grad():
63
  outputs = model(**inputs)
@@ -65,35 +146,90 @@ def estimate_pose(image, boxes, processor, model):
65
  pose_results = processor.post_process_pose_estimation(outputs, boxes=[boxes])
66
  return pose_results[0] if pose_results else None
67
 
 
68
 
69
- def draw_skeleton(frame, keypoints, scores, threshold=0.3):
70
- """Draw skeleton on a black background."""
71
  h, w = frame.shape[:2]
72
  skeleton_frame = np.zeros((h, w, 3), dtype=np.uint8)
73
 
74
  if keypoints is None:
75
- return skeleton_frame
76
 
77
- # Draw edges
78
- for start_idx, end_idx in SKELETON_EDGES:
79
- if scores[start_idx] > threshold and scores[end_idx] > threshold:
80
- start_point = (int(keypoints[start_idx][0]), int(keypoints[start_idx][1]))
81
- end_point = (int(keypoints[end_idx][0]), int(keypoints[end_idx][1]))
82
- color = KEYPOINT_COLORS[start_idx % len(KEYPOINT_COLORS)]
83
- cv2.line(skeleton_frame, start_point, end_point, color, 3)
 
 
 
84
 
85
- # Draw keypoints
86
  for i, (kp, score) in enumerate(zip(keypoints, scores)):
87
  if score > threshold:
88
  x, y = int(kp[0]), int(kp[1])
89
- color = KEYPOINT_COLORS[i % len(KEYPOINT_COLORS)]
90
  cv2.circle(skeleton_frame, (x, y), 6, color, -1)
91
  cv2.circle(skeleton_frame, (x, y), 8, (255, 255, 255), 2)
92
 
93
- return skeleton_frame
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  @spaces.GPU
96
- def process_video(video_path, progress=gr.Progress()):
97
  """Process video to extract skeleton frames and first frame."""
98
  if video_path is None:
99
  return None, None
@@ -104,7 +240,6 @@ def process_video(video_path, progress=gr.Progress()):
104
  width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
105
  height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
106
 
107
- # Temp output for skeleton video
108
  temp_dir = tempfile.mkdtemp()
109
  skeleton_path = os.path.join(temp_dir, "skeleton.mp4")
110
  first_frame_path = os.path.join(temp_dir, "first_frame.png")
@@ -115,6 +250,10 @@ def process_video(video_path, progress=gr.Progress()):
115
  first_frame_saved = False
116
  frame_idx = 0
117
 
 
 
 
 
118
  while True:
119
  ret, frame = cap.read()
120
  if not ret:
@@ -122,34 +261,25 @@ def process_video(video_path, progress=gr.Progress()):
122
 
123
  progress((frame_idx + 1) / total_frames, desc=f"Processing frame {frame_idx + 1}/{total_frames}")
124
 
125
- # Convert BGR to RGB for PIL
126
  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
127
  pil_image = Image.fromarray(frame_rgb)
128
 
129
- # Detect person
130
- boxes = detect_person(pil_image, person_processor, person_detector)
 
 
131
 
132
  if boxes is not None:
133
- # Save first frame with person
134
- if not first_frame_saved:
135
- cv2.imwrite(first_frame_path, frame)
136
- first_frame_saved = True
137
-
138
- # Estimate pose
139
- pose_results = estimate_pose(pil_image, boxes, vitpose_processor, vitpose_model)
140
 
141
  if pose_results:
142
- # Use first person's pose
143
  keypoints = pose_results[0]["keypoints"].cpu().numpy()
144
  scores = pose_results[0]["scores"].cpu().numpy()
145
- skeleton_frame = draw_skeleton(frame, keypoints, scores)
146
- else:
147
- skeleton_frame = np.zeros((height, width, 3), dtype=np.uint8)
148
- else:
149
- skeleton_frame = np.zeros((height, width, 3), dtype=np.uint8)
150
- if not first_frame_saved:
151
- cv2.imwrite(first_frame_path, frame)
152
- first_frame_saved = True
153
 
154
  out.write(skeleton_frame)
155
  frame_idx += 1
@@ -157,24 +287,51 @@ def process_video(video_path, progress=gr.Progress()):
157
  cap.release()
158
  out.release()
159
 
 
 
 
 
 
 
 
 
160
  return skeleton_path, first_frame_path if first_frame_saved else None
161
 
 
162
 
163
  with gr.Blocks() as demo:
164
  gr.Markdown("## ViTPose Skeleton Extractor")
165
-
 
166
  with gr.Row():
167
  video_input = gr.Video(label="Input Video")
168
 
 
 
 
 
 
 
 
 
169
  process_btn = gr.Button("Extract Skeleton", variant="primary")
170
 
171
  with gr.Row():
172
  skeleton_output = gr.Video(label="Skeleton Frames", interactive=True)
173
  first_frame_output = gr.Image(label="First Frame (Reference)", interactive=True)
174
 
 
 
 
 
 
 
 
 
 
175
  process_btn.click(
176
  fn=process_video,
177
- inputs=video_input,
178
  outputs=[skeleton_output, first_frame_output]
179
  )
180
 
 
8
  from PIL import Image
9
  import torch
10
 
11
device = "cuda" if torch.cuda.is_available() else "cpu"

# ============== SKELETON DEFINITIONS ==============

# Simple model: 17 COCO keypoints.
# Each pair is (start_index, end_index) into the keypoint array.
SIMPLE_EDGES = [
    # head
    (0, 1), (0, 2), (1, 3), (2, 4),
    # arms
    (5, 6), (5, 7), (7, 9), (6, 8), (8, 10),
    # torso
    (5, 11), (6, 12), (11, 12),
    # legs
    (11, 13), (13, 15), (12, 14), (14, 16),
]

# One color per COCO keypoint index (cycled via modulo when indexing).
SIMPLE_COLORS = [
    (255, 0, 0), (255, 85, 0), (255, 170, 0), (255, 255, 0),
    (170, 255, 0), (85, 255, 0), (0, 255, 0), (0, 255, 85),
    (0, 255, 170), (0, 255, 255), (0, 170, 255), (0, 85, 255),
    (0, 0, 255), (85, 0, 255), (170, 0, 255), (255, 0, 255), (255, 0, 170),
]
29
 
30
# WholeBody model: 133 keypoints
# 0-16: body, 17-22: feet, 23-90: face, 91-111: left hand, 112-132: right hand

# Body connectivity mirrors the 17-point COCO skeleton.
BODY_EDGES = [
    (0, 1), (0, 2), (1, 3), (2, 4),
    (5, 6), (5, 7), (7, 9), (6, 8), (8, 10),
    (5, 11), (6, 12), (11, 12),
    (11, 13), (13, 15), (12, 14), (14, 16),
]

# Toe/heel chains hang off the ankles (15 = left ankle, 16 = right ankle).
FEET_EDGES = [
    (15, 17), (17, 18), (18, 19),
    (16, 20), (20, 21), (21, 22),
]
44
+
45
# Face edges: each facial feature is a chain of consecutive keypoint
# indices; eyes and lips are closed back into loops.
# Runs are (first_index, point_count, closed?).
_FACE_RUNS = [
    (23, 17, False),  # jaw
    (40, 5, False),   # left eyebrow
    (45, 5, False),   # right eyebrow
    (50, 4, False),   # nose bridge
    (54, 5, False),   # nose bottom
    (59, 6, True),    # left eye
    (65, 6, True),    # right eye
    (71, 12, True),   # outer lip
    (83, 8, True),    # inner lip
]

FACE_EDGES = []
for _start, _count, _closed in _FACE_RUNS:
    # Chain consecutive points, then optionally close last -> first.
    FACE_EDGES.extend((_start + j, _start + j + 1) for j in range(_count - 1))
    if _closed:
        FACE_EDGES.append((_start + _count - 1, _start))
69
+
70
def get_hand_edges(start_idx):
    """Build the 20 skeleton edges for one 21-keypoint hand.

    Keypoint layout relative to *start_idx*: offset 0 is the wrist,
    and each finger occupies four consecutive points starting at
    offsets 1, 5, 9, 13 and 17.

    Args:
        start_idx: Absolute index of the hand's wrist keypoint in the
            133-point wholebody layout.

    Returns:
        List of (start, end) index pairs: the five wrist-to-finger-base
        edges first, then three segments along each finger.
    """
    finger_bases = [1, 5, 9, 13, 17]
    edges = [(start_idx, start_idx + base) for base in finger_bases]
    for base in finger_bases:
        edges.extend(
            (start_idx + base + seg, start_idx + base + seg + 1)
            for seg in range(3)
        )
    return edges


LEFT_HAND_EDGES = get_hand_edges(91)    # left hand spans indices 91-111
RIGHT_HAND_EDGES = get_hand_edges(112)  # right hand spans indices 112-132

# Per-part colors used by the wholebody renderer.
WHOLEBODY_COLORS = {
    'body': (0, 255, 255),
    'face': (255, 255, 255),
    'left_hand': (0, 255, 0),
    'right_hand': (0, 0, 255),
    'feet': (255, 0, 255),
}
92
+
93
# ============== LOAD MODELS ==============

print("Loading models...")

# Person detector (shared by both pose pipelines).
person_detector = RTDetrForObjectDetection.from_pretrained(
    "PekingU/rtdetr_r50vd_coco_o365"
).to(device)
person_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")

# Simple ViTPose (17 keypoints).
vitpose_simple = VitPoseForPoseEstimation.from_pretrained(
    "usyd-community/vitpose-base-simple"
).to(device)
vitpose_simple_processor = AutoProcessor.from_pretrained(
    "usyd-community/vitpose-base-simple"
)

# WholeBody ViTPose (133 keypoints).
vitpose_wholebody = VitPoseForPoseEstimation.from_pretrained(
    "usyd-community/vitpose-plus-huge-wholebody"
).to(device)
vitpose_wholebody_processor = AutoProcessor.from_pretrained(
    "usyd-community/vitpose-plus-huge-wholebody"
)

print("All models loaded!")
110
 
111
# ============== DETECTION & POSE FUNCTIONS ==============

def detect_person(image):
    """Detect person bounding boxes in *image*.

    Runs the module-level RT-DETR detector and keeps only detections
    whose label is 0 (COCO "person") at a fixed 0.3 score threshold.

    Args:
        image: PIL image; its ``height``/``width`` define the target
            size for box rescaling.

    Returns:
        List of numpy ``[x0, y0, x1, y1]`` boxes, or ``None`` when no
        person was detected.
    """
    inputs = person_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = person_detector(**inputs)

    target_sizes = torch.tensor([(image.height, image.width)])
    detections = person_processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=0.3
    )

    person_boxes = [
        box.cpu().numpy()
        for detection in detections
        for label, box in zip(detection["labels"], detection["boxes"])
        if label.item() == 0
    ]
    return person_boxes or None
130
 
131
+
132
@spaces.GPU
def estimate_pose(image, boxes, model_choice):
    """Estimate pose keypoints using selected model.

    Args:
        image: PIL image.
        boxes: Person boxes (from ``detect_person``) for this image.
        model_choice: ``"Simple (17 keypoints)"`` picks the 17-point
            model; any other value picks the 133-point wholebody model.

    Returns:
        Pose results for the single input image, or ``None`` when the
        post-processor returns nothing.
    """
    if model_choice == "Simple (17 keypoints)":
        processor, model = vitpose_simple_processor, vitpose_simple
    else:
        processor, model = vitpose_wholebody_processor, vitpose_wholebody

    inputs = processor(images=image, boxes=[boxes], return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    pose_results = processor.post_process_pose_estimation(outputs, boxes=[boxes])
    return pose_results[0] if pose_results else None
148
 
149
# ============== DRAWING FUNCTIONS ==============

def draw_simple_skeleton(frame, keypoints, scores, threshold=0.3):
    """Draw the 17-keypoint skeleton on a black canvas sized like *frame*.

    Args:
        frame: Source frame; only its height/width are used.
        keypoints: Sequence of (x, y) keypoint positions, or ``None``.
        scores: Per-keypoint confidence scores.
        threshold: Minimum confidence for a point/edge to be drawn.

    Returns:
        ``(canvas, has_valid)``: the rendered frame and whether at
        least one edge had both endpoints above the threshold.
    """
    h, w = frame.shape[:2]
    canvas = np.zeros((h, w, 3), dtype=np.uint8)

    if keypoints is None:
        return canvas, False

    def _point(idx):
        return int(keypoints[idx][0]), int(keypoints[idx][1])

    n = len(scores)
    has_valid = False
    for a, b in SIMPLE_EDGES:
        # Guard indices so shorter score arrays cannot raise.
        if a < n and b < n and scores[a] > threshold and scores[b] > threshold:
            edge_color = SIMPLE_COLORS[a % len(SIMPLE_COLORS)]
            cv2.line(canvas, _point(a), _point(b), edge_color, 3)
            has_valid = True

    for i, (kp, score) in enumerate(zip(keypoints, scores)):
        if score > threshold:
            center = (int(kp[0]), int(kp[1]))
            cv2.circle(canvas, center, 6, SIMPLE_COLORS[i % len(SIMPLE_COLORS)], -1)
            # White ring makes the dot readable on any edge color.
            cv2.circle(canvas, center, 8, (255, 255, 255), 2)

    return canvas, has_valid
178
+
179
+
180
def draw_edges(frame, keypoints, scores, edges, color, threshold=0.3, thickness=2):
    """Draw each (start, end) edge whose endpoints both clear *threshold*.

    Mutates *frame* in place; out-of-range indices are skipped.
    """
    n = len(scores)
    for a, b in edges:
        if a < n and b < n and scores[a] > threshold and scores[b] > threshold:
            p0 = (int(keypoints[a][0]), int(keypoints[a][1]))
            p1 = (int(keypoints[b][0]), int(keypoints[b][1]))
            cv2.line(frame, p0, p1, color, thickness)
188
+
189
+
190
def draw_keypoints(frame, keypoints, scores, indices, color, threshold=0.3, radius=3):
    """Draw a filled circle at every index in *indices* above *threshold*.

    Mutates *frame* in place; out-of-range indices are skipped.
    """
    n = len(scores)
    for idx in indices:
        if idx < n and scores[idx] > threshold:
            center = (int(keypoints[idx][0]), int(keypoints[idx][1]))
            cv2.circle(frame, center, radius, color, -1)
196
+
197
+
198
def draw_wholebody_skeleton(frame, keypoints, scores, threshold=0.3):
    """Draw the 133-keypoint wholebody skeleton, color-coded per part.

    Args:
        frame: Source frame; only its height/width are used.
        keypoints: Sequence of (x, y) keypoint positions, or ``None``.
        scores: Per-keypoint confidence scores.
        threshold: Minimum confidence for a point/edge to be drawn.

    Returns:
        ``(canvas, has_valid)``: the rendered black-background frame,
        and ``True`` when at least 5 of the 17 body keypoints clear
        the threshold.
    """
    h, w = frame.shape[:2]
    canvas = np.zeros((h, w, 3), dtype=np.uint8)

    if keypoints is None:
        return canvas, False

    # (edges, keypoint index range, color key, line thickness, dot radius).
    # A None index range means edges only — face dots are intentionally
    # skipped to keep the render readable.
    parts = [
        (BODY_EDGES, range(17), 'body', 3, 5),
        (FEET_EDGES, range(17, 23), 'feet', 2, 3),
        (FACE_EDGES, None, 'face', 1, 0),
        (LEFT_HAND_EDGES, range(91, 112), 'left_hand', 2, 2),
        (RIGHT_HAND_EDGES, range(112, 133), 'right_hand', 2, 2),
    ]
    for edges, indices, key, thickness, radius in parts:
        color = WHOLEBODY_COLORS[key]
        draw_edges(canvas, keypoints, scores, edges, color, threshold, thickness)
        if indices is not None:
            draw_keypoints(canvas, keypoints, scores, indices, color, threshold, radius)

    # "Person found" requires a minimally visible body, not just face/hands.
    body_scores = scores[:17] if len(scores) >= 17 else scores
    has_valid = np.sum(body_scores > threshold) >= 5

    return canvas, has_valid
228
+
229
+ # ============== MAIN PROCESSING ==============
230
 
231
  @spaces.GPU
232
+ def process_video(video_path, model_choice, progress=gr.Progress()):
233
  """Process video to extract skeleton frames and first frame."""
234
  if video_path is None:
235
  return None, None
 
240
  width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
241
  height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
242
 
 
243
  temp_dir = tempfile.mkdtemp()
244
  skeleton_path = os.path.join(temp_dir, "skeleton.mp4")
245
  first_frame_path = os.path.join(temp_dir, "first_frame.png")
 
250
  first_frame_saved = False
251
  frame_idx = 0
252
 
253
+ # Select drawing function based on model
254
+ use_wholebody = model_choice == "WholeBody (133 keypoints)"
255
+ draw_fn = draw_wholebody_skeleton if use_wholebody else draw_simple_skeleton
256
+
257
  while True:
258
  ret, frame = cap.read()
259
  if not ret:
 
261
 
262
  progress((frame_idx + 1) / total_frames, desc=f"Processing frame {frame_idx + 1}/{total_frames}")
263
 
 
264
  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
265
  pil_image = Image.fromarray(frame_rgb)
266
 
267
+ boxes = detect_person(pil_image)
268
+
269
+ skeleton_frame = np.zeros((height, width, 3), dtype=np.uint8)
270
+ person_detected = False
271
 
272
  if boxes is not None:
273
+ pose_results = estimate_pose(pil_image, boxes, model_choice)
 
 
 
 
 
 
274
 
275
  if pose_results:
 
276
  keypoints = pose_results[0]["keypoints"].cpu().numpy()
277
  scores = pose_results[0]["scores"].cpu().numpy()
278
+ skeleton_frame, person_detected = draw_fn(frame, keypoints, scores)
279
+
280
+ if person_detected and not first_frame_saved:
281
+ cv2.imwrite(first_frame_path, frame)
282
+ first_frame_saved = True
 
 
 
283
 
284
  out.write(skeleton_frame)
285
  frame_idx += 1
 
287
  cap.release()
288
  out.release()
289
 
290
+ if not first_frame_saved:
291
+ cap = cv2.VideoCapture(video_path)
292
+ ret, frame = cap.read()
293
+ if ret:
294
+ cv2.imwrite(first_frame_path, frame)
295
+ first_frame_saved = True
296
+ cap.release()
297
+
298
  return skeleton_path, first_frame_path if first_frame_saved else None
299
 
300
# ============== GRADIO UI ==============

with gr.Blocks() as demo:
    gr.Markdown("## ViTPose Skeleton Extractor")
    gr.Markdown("Choose between fast 17-keypoint extraction or detailed 133-keypoint wholebody extraction")

    with gr.Row():
        video_input = gr.Video(label="Input Video")

    with gr.Row():
        # Choice strings must match the checks inside process_video /
        # estimate_pose exactly.
        model_choice = gr.Radio(
            choices=["Simple (17 keypoints)", "WholeBody (133 keypoints)"],
            value="Simple (17 keypoints)",
            label="Model Selection",
            info="WholeBody includes hands, face & feet but is slower",
        )

    process_btn = gr.Button("Extract Skeleton", variant="primary")

    with gr.Row():
        skeleton_output = gr.Video(label="Skeleton Frames", interactive=True)
        first_frame_output = gr.Image(label="First Frame (Reference)", interactive=True)

    with gr.Accordion("Color Legend (WholeBody mode)", open=False):
        gr.Markdown("""
        - 🔵 **Cyan**: Body (17 keypoints)
        - 🟣 **Magenta**: Feet (6 keypoints)
        - ⚪ **White**: Face (68 keypoints)
        - 🟢 **Green**: Left hand (21 keypoints)
        - 🔴 **Red**: Right hand (21 keypoints)
        """)

    process_btn.click(
        fn=process_video,
        inputs=[video_input, model_choice],
        outputs=[skeleton_output, first_frame_output],
    )
  )
337