Spaces:

James040
/

Pose-Extractor-Video-MP

Sleeping

App Files Files Community

James040 committed on Apr 4

Commit

549107e

verified ·

1 Parent(s): 36f5403

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -41

app.py CHANGED Viewed

@@ -4,13 +4,14 @@ import gradio as gr
 import subprocess
 import urllib.request
 import os
-# 1. Use the Modern Tasks API
 import mediapipe as mp
 from mediapipe.tasks import python
 from mediapipe.tasks.python import vision
-# Auto-Download the Pose Model for the CPU
 MODEL_PATH = "pose_landmarker_lite.task"
 MODEL_URL = "https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_lite/float16/1/pose_landmarker_lite.task"
@@ -18,7 +19,6 @@ if not os.path.exists(MODEL_PATH):
     print("Downloading MediaPipe Pose Model...")
     urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)
-# 2. Hardcode Skeleton Connections (Bypassing the broken drawing_utils)
 POSE_CONNECTIONS = [
     (0, 1), (1, 2), (2, 3), (3, 7), (0, 4), (4, 5), (5, 6), (6, 8), (9, 10),
     (11, 12), (11, 13), (13, 15), (15, 17), (15, 19), (15, 21), (17, 19),
@@ -27,12 +27,13 @@ POSE_CONNECTIONS = [
     (28, 30), (29, 31), (30, 32), (27, 31), (28, 32)
 ]
-def extract_pose(video_path):
     if video_path is None:
-        return None
-    output_path = "final_output.mp4"
     temp_video = "temp_silent.mp4"
     cap = cv2.VideoCapture(video_path)
     width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
@@ -42,13 +43,15 @@ def extract_pose(video_path):
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(temp_video, fourcc, fps, (width, height))
-    # 3. Configure Tasks API for Video Processing
     base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
     options = vision.PoseLandmarkerOptions(
         base_options=base_options,
         running_mode=vision.RunningMode.VIDEO
     )
     with vision.PoseLandmarker.create_from_options(options) as landmarker:
         frame_idx = 0
         while cap.isOpened():
@@ -56,64 +59,91 @@ def extract_pose(video_path):
             if not ret:
                 break
-            # Format frame for Tasks API
             rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
-            # Strict timestamp required for video mode
             timestamp_ms = int((frame_idx / fps) * 1000)
-            # Run Inference
             result = landmarker.detect_for_video(mp_image, timestamp_ms)
-            # Pure Black Canvas
             canvas = np.zeros((height, width, 3), dtype=np.uint8)
-            # 4. Draw Meaty Lines Manually
-            if result.pose_landmarks:
-                for pose in result.pose_landmarks:
-                    # Draw Thick Green Bones
-                    for connection in POSE_CONNECTIONS:
-                        start_idx, end_idx = connection
-                        start_pt = pose[start_idx]
-                        end_pt = pose[end_idx]
-                        start_px = (int(start_pt.x * width), int(start_pt.y * height))
-                        end_px = (int(end_pt.x * width), int(end_pt.y * height))
-                        cv2.line(canvas, start_px, end_px, (0, 255, 0), 10)
-                    # Draw Large White Joints
-                    for landmark in pose:
-                        px = (int(landmark.x * width), int(landmark.y * height))
-                        cv2.circle(canvas, px, 15, (255, 255, 255), -1)
             out.write(canvas)
             frame_idx += 1
     cap.release()
     out.release()
     # Merge Audio Native FFmpeg
     try:
         command = [
             "ffmpeg", "-y", "-i", temp_video, "-i", video_path,
             "-c:v", "copy", "-c:a", "aac", "-map", "0:v:0", "-map", "1:a:0?",
-            "-shortest", output_path
         ]
         subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        return output_path
     except Exception as e:
         print("FFmpeg error:", e)
-        return temp_video
-# UI Setup
-interface = gr.Interface(
-    fn=extract_pose,
-    inputs=gr.Video(label="Upload Dancing Clip (15-30s)"),
-    outputs=gr.Video(label="Meaty Stickman Output"),
-    title="Tasks API Pose Extractor",
-    description="Uses modern MediaPipe Tasks to generate thick tracking lines for EbSynth."
-)
 if __name__ == "__main__":
     interface.launch()

 import subprocess
 import urllib.request
 import os
+import json
+# 1. Modern Tasks API
 import mediapipe as mp
 from mediapipe.tasks import python
 from mediapipe.tasks.python import vision
+# Auto-Download Model
 MODEL_PATH = "pose_landmarker_lite.task"
 MODEL_URL = "https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_lite/float16/1/pose_landmarker_lite.task"
     print("Downloading MediaPipe Pose Model...")
     urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)
 POSE_CONNECTIONS = [
     (0, 1), (1, 2), (2, 3), (3, 7), (0, 4), (4, 5), (5, 6), (6, 8), (9, 10),
     (11, 12), (11, 13), (13, 15), (15, 17), (15, 19), (15, 21), (17, 19),
     (28, 30), (29, 31), (30, 32), (27, 31), (28, 32)
 ]
+def extract_pose_and_data(video_path):
     if video_path is None:
+        return None, None, None
+    output_video_path = "final_output.mp4"
     temp_video = "temp_silent.mp4"
+    output_json_path = "pose_data.json"
     cap = cv2.VideoCapture(video_path)
     width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(temp_video, fourcc, fps, (width, height))
     base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
     options = vision.PoseLandmarkerOptions(
         base_options=base_options,
         running_mode=vision.RunningMode.VIDEO
     )
+    # Storage for Blender Data
+    all_frames_data = []
     with vision.PoseLandmarker.create_from_options(options) as landmarker:
         frame_idx = 0
         while cap.isOpened():
             if not ret:
                 break
             rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
             timestamp_ms = int((frame_idx / fps) * 1000)
             result = landmarker.detect_for_video(mp_image, timestamp_ms)
             canvas = np.zeros((height, width, 3), dtype=np.uint8)
+            frame_entry = {
+                "frame": frame_idx,
+                "timestamp_ms": timestamp_ms,
+                "landmarks": []
+            }
+            if result.pose_landmarks and result.pose_world_landmarks:
+                # 1. Extract 3D World Data for JSON (For Blender)
+                for landmark in result.pose_world_landmarks[0]:
+                    frame_entry["landmarks"].append({
+                        "x": landmark.x,
+                        "y": landmark.y,
+                        "z": landmark.z,
+                        "visibility": landmark.visibility
+                    })
+                # 2. Draw 2D Data for Video (For EbSynth)
+                pose = result.pose_landmarks[0]
+                for connection in POSE_CONNECTIONS:
+                    start_idx, end_idx = connection
+                    start_pt, end_pt = pose[start_idx], pose[end_idx]
+                    start_px = (int(start_pt.x * width), int(start_pt.y * height))
+                    end_px = (int(end_pt.x * width), int(end_pt.y * height))
+                    cv2.line(canvas, start_px, end_px, (0, 255, 0), 10)
+                for landmark in pose:
+                    px = (int(landmark.x * width), int(landmark.y * height))
+                    cv2.circle(canvas, px, 15, (255, 255, 255), -1)
+            all_frames_data.append(frame_entry)
             out.write(canvas)
             frame_idx += 1
     cap.release()
     out.release()
+    # Save the JSON file
+    with open(output_json_path, 'w') as f:
+        json.dump(all_frames_data, f, indent=4)
     # Merge Audio Native FFmpeg
     try:
         command = [
             "ffmpeg", "-y", "-i", temp_video, "-i", video_path,
             "-c:v", "copy", "-c:a", "aac", "-map", "0:v:0", "-map", "1:a:0?",
+            "-shortest", output_video_path
         ]
         subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     except Exception as e:
         print("FFmpeg error:", e)
+        output_video_path = temp_video
+    # Return: video file path, JSON file path (for download), list of per-frame JSON entries (for UI copying)
+    return output_video_path, output_json_path, all_frames_data
+# Gradio UI Setup
+with gr.Blocks(title="Pose & 3D Data Extractor") as interface:
+    gr.Markdown("# 🕺 Pose Video & 3D JSON Extractor")
+    gr.Markdown("Generates a thick stickman for EbSynth and extracts `pose_world_landmarks` (x, y, z) for Blender IK.")
+    with gr.Row():
+        with gr.Column():
+            video_input = gr.Video(label="Upload Dancing Clip (15-30s)")
+            submit_btn = gr.Button("Extract Pose & Data", variant="primary")
+        with gr.Column():
+            video_output = gr.Video(label="Meaty Stickman Output")
+            file_output = gr.File(label="Download 3D JSON Data")
+    with gr.Row():
+        # The gr.JSON component automatically includes a "Copy" button in the top right
+        json_output = gr.JSON(label="Raw JSON Data (Click top right to Copy)")
+    submit_btn.click(
+        fn=extract_pose_and_data,
+        inputs=video_input,
+        outputs=[video_output, file_output, json_output]
+    )
 if __name__ == "__main__":
     interface.launch()