Spaces:

seesaw112233
/

pose-estimation

Sleeping

App Files Files Community

seesaw112233 committed on Dec 25, 2025

Commit

1e8a048

verified ·

1 Parent(s): 8d2db9a

Update app.py

Browse files

Files changed (1) hide show

app.py +241 -151

app.py CHANGED Viewed

@@ -4,12 +4,45 @@ import json
 import tempfile
 from dataclasses import dataclass
 from typing import Dict, List, Tuple, Optional
 import cv2
 import numpy as np
 import pandas as pd
 import gradio as gr
 import mediapipe as mp
 # -------------------------
@@ -53,70 +86,87 @@ def angle_3pts(a: np.ndarray, b: np.ndarray, c: np.ndarray) -> Optional[float]:
 # -------------------------
 # MediaPipe indices
 # -------------------------
-# FaceMesh landmarks for EAR (common set)
 LEFT_EYE_EAR_IDX  = [33, 160, 158, 133, 153, 144]
 RIGHT_EYE_EAR_IDX = [362, 385, 387, 263, 373, 380]
-# Pose landmark enum mapping (MediaPipe Pose)
-POSE = mp.solutions.pose
-POSE_LM = POSE.PoseLandmark
-# Key joints for limb movement/angles
-JOINTS = {
-    "left_wrist": POSE_LM.LEFT_WRIST.value,
-    "right_wrist": POSE_LM.RIGHT_WRIST.value,
-    "left_ankle": POSE_LM.LEFT_ANKLE.value,
-    "right_ankle": POSE_LM.RIGHT_ANKLE.value,
-    "left_shoulder": POSE_LM.LEFT_SHOULDER.value,
-    "right_shoulder": POSE_LM.RIGHT_SHOULDER.value,
-    "left_elbow": POSE_LM.LEFT_ELBOW.value,
-    "right_elbow": POSE_LM.RIGHT_ELBOW.value,
-    "left_hip": POSE_LM.LEFT_HIP.value,
-    "right_hip": POSE_LM.RIGHT_HIP.value,
-    "left_knee": POSE_LM.LEFT_KNEE.value,
-    "right_knee": POSE_LM.RIGHT_KNEE.value,
 }
 # -------------------------
-# Drawing
 # -------------------------
-mp_drawing = mp.solutions.drawing_utils
-mp_drawing_styles = mp.solutions.drawing_styles
-mp_face_mesh = mp.solutions.face_mesh
-def draw_pose(image_bgr, pose_results):
-    if pose_results.pose_landmarks:
-        mp_drawing.draw_landmarks(
-            image_bgr,
-            pose_results.pose_landmarks,
-            POSE.POSE_CONNECTIONS,
-            landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style(),
-        )
-def draw_face(image_bgr, face_results, draw_full_mesh: bool = False):
-    if not face_results.multi_face_landmarks:
         return
-    for face_landmarks in face_results.multi_face_landmarks:
-        if draw_full_mesh:
-            # full mesh (dense) - heavier visually
-            mp_drawing.draw_landmarks(
-                image_bgr,
-                face_landmarks,
-                mp_face_mesh.FACEMESH_TESSELATION,
-                landmark_drawing_spec=None,
-                connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style(),
-            )
-        # contours are enough for most
         mp_drawing.draw_landmarks(
-            image_bgr,
-            face_landmarks,
-            mp_face_mesh.FACEMESH_CONTOURS,
             landmark_drawing_spec=None,
-            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_contours_style(),
         )
 # -------------------------
@@ -135,7 +185,6 @@ def update_blink(state: BlinkState, ear: Optional[float], thr: float, min_consec
     - when ear goes back above => blink end (count once)
     """
     if ear is None:
-        # treat missing as no-update
         return state
     if ear < thr:
@@ -151,23 +200,25 @@ def update_blink(state: BlinkState, ear: Optional[float], thr: float, min_consec
 # -------------------------
-# Core processing
 # -------------------------
 def process_video(
     video_path: str,
-    pose_model_complexity: int = 1,
     min_pose_det_conf: float = 0.5,
     min_pose_track_conf: float = 0.5,
-    min_face_det_conf: float = 0.5,
     ear_threshold: float = 0.21,
     blink_min_consec: int = 2,
     draw_full_face_mesh: bool = False,
-    max_frames: int = 0,  # 0 => all
 ) -> Tuple[str, str, str, str]:
     """
-    Returns:
-      annotated_video_path, csv_path, json_path, report_md
     """
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
         raise RuntimeError("Cannot open video. Please upload a valid video file.")
@@ -179,7 +230,7 @@ def process_video(
     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    # output paths
     tmpdir = tempfile.mkdtemp(prefix="mp_analysis_")
     out_video = os.path.join(tmpdir, "annotated.mp4")
     out_csv = os.path.join(tmpdir, "per_frame_metrics.csv")
@@ -189,23 +240,42 @@ def process_video(
     fourcc = cv2.VideoWriter_fourcc(*"mp4v")
     writer = cv2.VideoWriter(out_video, fourcc, fps, (width, height))
-    # MediaPipe init - using legacy API (works without model downloads)
-    with mp.solutions.pose.Pose(
-        static_image_mode=False,
-        model_complexity=pose_model_complexity,
-        enable_segmentation=False,
-        min_detection_confidence=min_pose_det_conf,
         min_tracking_confidence=min_pose_track_conf,
-    ) as pose, mp_face_mesh.FaceMesh(
-        static_image_mode=False,
-        max_num_faces=1,
-        refine_landmarks=True,  # improves eye landmarks
-        min_detection_confidence=min_face_det_conf,
-        min_tracking_confidence=min_face_det_conf,
-    ) as face_mesh:
         rows = []
-        prev_pts = {}  # for movement delta (normalized coordinates)
         left_blink = BlinkState()
         right_blink = BlinkState()
@@ -218,43 +288,53 @@ def process_video(
             if max_frames and frame_idx > max_frames:
                 break
             frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
-            pose_res = pose.process(frame_rgb)
-            face_res = face_mesh.process(frame_rgb)
-            # Extract face landmarks (pixel coords)
             face_pts: Dict[int, np.ndarray] = {}
-            if face_res.multi_face_landmarks:
-                lm = face_res.multi_face_landmarks[0].landmark
-                for i in range(len(lm)):
-                    face_pts[i] = np.array([lm[i].x * width, lm[i].y * height], dtype=np.float32)
-            # EAR
             left_ear = eye_aspect_ratio(face_pts, LEFT_EYE_EAR_IDX)
             right_ear = eye_aspect_ratio(face_pts, RIGHT_EYE_EAR_IDX)
             left_blink = update_blink(left_blink, left_ear, ear_threshold, blink_min_consec)
             right_blink = update_blink(right_blink, right_ear, ear_threshold, blink_min_consec)
-            # Extract pose landmarks (normalized coords + pixel)
             pose_norm: Dict[str, Optional[np.ndarray]] = {}
             pose_px: Dict[str, Optional[np.ndarray]] = {}
-            if pose_res.pose_landmarks:
-                lms = pose_res.pose_landmarks.landmark
-                for name, idx in JOINTS.items():
-                    if idx < len(lms):
-                        pose_norm[name] = np.array([lms[idx].x, lms[idx].y], dtype=np.float32)
-                        pose_px[name] = np.array([lms[idx].x * width, lms[idx].y * height], dtype=np.float32)
                     else:
                         pose_norm[name] = None
                         pose_px[name] = None
             else:
-                for name in JOINTS:
                     pose_norm[name] = None
                     pose_px[name] = None
-            # Limb movement: per-frame displacement & speed (in normalized units)
             def movement_metrics(key: str):
                 cur = pose_norm.get(key)
                 if cur is None:
@@ -273,7 +353,7 @@ def process_video(
             la_d, la_v = movement_metrics("left_ankle")
             ra_d, ra_v = movement_metrics("right_ankle")
-            # Joint angles (pixel coords for stability)
             def get_angle(a, b, c):
                 if a is None or b is None or c is None:
                     return None
@@ -285,19 +365,19 @@ def process_video(
             right_knee_ang = get_angle(pose_px["right_hip"], pose_px["right_knee"], pose_px["right_ankle"])
             # Draw overlays
-            draw_pose(frame_bgr, pose_res)
-            draw_face(frame_bgr, face_res, draw_full_mesh=draw_full_face_mesh)
             # HUD text
             hud_lines = [
-                f"frame: {frame_idx}/{total_frames if total_frames>0 else '?'}  fps:{fps:.1f}",
                 f"EAR L:{left_ear:.3f}" if left_ear is not None else "EAR L:None",
                 f"EAR R:{right_ear:.3f}" if right_ear is not None else "EAR R:None",
-                f"Blink L:{left_blink.blink_count}  R:{right_blink.blink_count}",
             ]
             y0 = 24
             for line in hud_lines:
-                cv2.putText(frame_bgr, line, (12, y0), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
                 y0 += 22
             writer.write(frame_bgr)
@@ -305,20 +385,16 @@ def process_video(
             rows.append({
                 "frame": frame_idx,
                 "time_s": (frame_idx - 1) / fps,
                 "left_ear": left_ear,
                 "right_ear": right_ear,
                 "lw_disp": lw_d,
                 "rw_disp": rw_d,
                 "la_disp": la_d,
                 "ra_disp": ra_d,
                 "lw_speed": lw_v,
                 "rw_speed": rw_v,
                 "la_speed": la_v,
                 "ra_speed": ra_v,
                 "left_elbow_angle": left_elbow_ang,
                 "right_elbow_angle": right_elbow_ang,
                 "left_knee_angle": left_knee_ang,
@@ -337,7 +413,6 @@ def process_video(
             return {"mean": None, "min": None, "max": None}
         return {"mean": float(s2.mean()), "min": float(s2.min()), "max": float(s2.max())}
-    # movement totals in normalized units (roughly proportional)
     summary = {
         "video": {
             "fps": float(fps),
@@ -383,34 +458,36 @@ def process_video(
     with open(out_json, "w", encoding="utf-8") as f:
         json.dump(summary, f, ensure_ascii=False, indent=2)
-    report_md = f"""# MediaPipe Pose + FaceLandmarks Analysis Report
-## Video Information
-- Resolution: {width} x {height}
 - FPS: {fps:.2f}
-- Frames Processed: {len(df)}
-- Duration (seconds): {summary["video"]["duration_s"]:.2f}
-## Blink Analysis (EAR)
-- Threshold: {ear_threshold}
-- Minimum Consecutive Frames: {blink_min_consec}
-- Left Eye Blinks: {summary["blink"]["left_blinks"]} ({summary["blink"]["left_blinks_per_min"]:.2f} blinks/min)
-- Right Eye Blinks: {summary["blink"]["right_blinks"]} ({summary["blink"]["right_blinks_per_min"]:.2f} blinks/min)
-- Left Eye EAR: mean={summary["blink"]["left_ear_stats"]["mean"]}  min={summary["blink"]["left_ear_stats"]["min"]}  max={summary["blink"]["left_ear_stats"]["max"]}
-- Right Eye EAR: mean={summary["blink"]["right_ear_stats"]["mean"]} min={summary["blink"]["right_ear_stats"]["min"]} max={summary["blink"]["right_ear_stats"]["max"]}
-## Limb Movement (normalized units)
-> Displacement/speed based on normalized coordinates (0~1), suitable for relative comparison and trend analysis.
-- Total Displacement (higher = more movement):
-  - Left Wrist: {summary["limb_movement"]["total_disp"]["left_wrist"]:.6f}
-  - Right Wrist: {summary["limb_movement"]["total_disp"]["right_wrist"]:.6f}
-  - Left Ankle: {summary["limb_movement"]["total_disp"]["left_ankle"]:.6f}
-  - Right Ankle: {summary["limb_movement"]["total_disp"]["right_ankle"]:.6f}
-## Output Files
-- annotated.mp4: Video with Pose and FaceMesh overlays
-- per_frame_metrics.csv: Frame-by-frame metrics (EAR / displacement / speed / joint angles)
-- summary.json: Statistical summary
 """
     with open(out_report, "w", encoding="utf-8") as f:
         f.write(report_md)
@@ -423,16 +500,15 @@ def process_video(
 # -------------------------
 def ui_process(
     video,
-    pose_model_complexity,
     min_pose_det_conf,
     min_pose_track_conf,
-    min_face_det_conf,
     ear_threshold,
     blink_min_consec,
     draw_full_face_mesh,
     max_frames
 ):
-    # video may be dict in some gradio versions
     if isinstance(video, dict) and "path" in video:
         video_path = video["path"]
     else:
@@ -441,64 +517,78 @@ def ui_process(
     try:
         out_video, out_csv, out_json, out_report = process_video(
             video_path=str(video_path),
-            pose_model_complexity=int(pose_model_complexity),
             min_pose_det_conf=float(min_pose_det_conf),
             min_pose_track_conf=float(min_pose_track_conf),
-            min_face_det_conf=float(min_face_det_conf),
             ear_threshold=float(ear_threshold),
             blink_min_consec=int(blink_min_consec),
             draw_full_face_mesh=bool(draw_full_face_mesh),
             max_frames=int(max_frames),
         )
-        # Show report text + return files
         with open(out_report, "r", encoding="utf-8") as f:
             report_text = f.read()
         return out_video, out_csv, out_json, report_text
     except Exception as e:
-        error_msg = f"# Error Processing Video\n\n{str(e)}"
         return None, None, None, error_msg
-demo = gr.Blocks(title="Video Pose + FaceLandmarks + Blink/Limb Analytics")
 with demo:
-    gr.Markdown("## Upload Video → MediaPipe Pose + FaceMesh → Limb Movement & Blink Quantification (EAR)")
     with gr.Row():
-        video_in = gr.Video(label="Upload Video")
-    with gr.Accordion("Parameters (defaults work well)", open=False):
-        pose_model_complexity = gr.Radio([0, 1, 2], value=1, label="Pose model_complexity (0=fast / 2=accurate)")
-        min_pose_det_conf = gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="Pose min_detection_confidence")
-        min_pose_track_conf = gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="Pose min_tracking_confidence")
-        min_face_det_conf = gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="Face min_detection_confidence")
-        ear_threshold = gr.Slider(0.10, 0.35, value=0.21, step=0.01, label="Blink Threshold EAR (lower = stricter)")
-        blink_min_consec = gr.Slider(1, 6, value=2, step=1, label="Blink Min Consecutive Frames (anti-jitter)")
-        draw_full_face_mesh = gr.Checkbox(value=False, label="Overlay Full FaceMesh (denser/slower)")
-        max_frames = gr.Number(value=0, precision=0, label="Max Frames to Process (0=all, set 300 for debugging)")
-    run_btn = gr.Button("Start Analysis", variant="primary")
     with gr.Row():
-        video_out = gr.Video(label="Output: Annotated Video")
     with gr.Row():
-        csv_out = gr.File(label="Per-Frame Metrics CSV")
-        json_out = gr.File(label="Summary JSON")
     report_out = gr.Markdown()
     run_btn.click(
         fn=ui_process,
         inputs=[
             video_in,
-            pose_model_complexity,
             min_pose_det_conf,
             min_pose_track_conf,
-            min_face_det_conf,
             ear_threshold,
             blink_min_consec,
             draw_full_face_mesh,

 import tempfile
 from dataclasses import dataclass
 from typing import Dict, List, Tuple, Optional
+import urllib.request
 import cv2
 import numpy as np
 import pandas as pd
 import gradio as gr
 import mediapipe as mp
+from mediapipe import solutions
+from mediapipe.framework.formats import landmark_pb2
+from mediapipe.tasks import python
+from mediapipe.tasks.python import vision
+# -------------------------
+# Model download helper
+# -------------------------
+def download_models():
+    """Download the required MediaPipe task models if they are not cached yet.
+
+    Models are stored under ``/tmp/mediapipe_models``. A model whose file
+    already exists there is not downloaded again.
+
+    Returns:
+        Tuple ``(face_landmarker_path, pose_landmarker_path)`` of local paths.
+
+    Raises:
+        urllib.error.URLError: if a model cannot be fetched.
+        OSError: if the cache directory or a model file cannot be written.
+    """
+    models_dir = "/tmp/mediapipe_models"
+    os.makedirs(models_dir, exist_ok=True)
+    models = {
+        "face_landmarker": {
+            "url": "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task",
+            "path": os.path.join(models_dir, "face_landmarker.task")
+        },
+        "pose_landmarker": {
+            "url": "https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_heavy/float16/1/pose_landmarker_heavy.task",
+            "path": os.path.join(models_dir, "pose_landmarker_heavy.task")
+        }
+    }
+    for model_name, model_info in models.items():
+        if os.path.exists(model_info["path"]):
+            continue
+        print(f"Downloading {model_name}...")
+        # Download to a sibling ".part" file and rename only on success, so an
+        # interrupted transfer never leaves a truncated file at the final path
+        # (the existence check above would otherwise accept it as a valid
+        # cached model on every later call).
+        partial_path = model_info["path"] + ".part"
+        try:
+            urllib.request.urlretrieve(model_info["url"], partial_path)
+            os.replace(partial_path, model_info["path"])
+        finally:
+            # After a successful rename the partial file no longer exists;
+            # this only cleans up after a failed download.
+            if os.path.exists(partial_path):
+                os.remove(partial_path)
+        print(f"✓ Downloaded {model_name}")
+    return models["face_landmarker"]["path"], models["pose_landmarker"]["path"]
 # -------------------------
 # -------------------------
 # MediaPipe indices
 # -------------------------
+# FaceMesh landmarks for EAR (same indices work for new API)
 LEFT_EYE_EAR_IDX  = [33, 160, 158, 133, 153, 144]
 RIGHT_EYE_EAR_IDX = [362, 385, 387, 263, 373, 380]
+# Joint name -> index into the per-pose landmark list returned by the pose
+# landmarker. process_video uses these indices to look up each joint's
+# coordinates for the movement and joint-angle metrics.
+POSE_LANDMARKS = {
+    "left_wrist": 15,
+    "right_wrist": 16,
+    "left_ankle": 27,
+    "right_ankle": 28,
+    "left_shoulder": 11,
+    "right_shoulder": 12,
+    "left_elbow": 13,
+    "right_elbow": 14,
+    "left_hip": 23,
+    "right_hip": 24,
+    "left_knee": 25,
+    "right_knee": 26,
 }
 # -------------------------
+# Drawing helpers for new API
 # -------------------------
+# Drawing utilities and default drawing styles from the `mediapipe.solutions`
+# package; the overlay helpers below call into these.
+mp_drawing = solutions.drawing_utils
+mp_drawing_styles = solutions.drawing_styles
+# Face mesh connection sets passed to draw_landmarks by draw_face_landmarks
+FACEMESH_TESSELATION = solutions.face_mesh.FACEMESH_TESSELATION
+FACEMESH_CONTOURS = solutions.face_mesh.FACEMESH_CONTOURS
+# Pose connection set passed to draw_landmarks by draw_pose_landmarks
+POSE_CONNECTIONS = solutions.pose.POSE_CONNECTIONS
+def draw_face_landmarks(image, face_landmarks, draw_full_mesh=False):
+    """Draw the face-mesh overlay for one face onto ``image``.
+
+    Args:
+        image: Frame to draw on; it is handed directly to
+            ``mp_drawing.draw_landmarks``.
+        face_landmarks: Iterable of landmarks exposing ``x``, ``y`` and ``z``
+            attributes, or ``None`` when no face was detected (nothing is
+            drawn in that case).
+        draw_full_mesh: When true, the tessellation connections are drawn in
+            addition to the contours.
+
+    Returns:
+        None.
+    """
+    if face_landmarks is None:
         return
+    # draw_landmarks is given a landmark_pb2.NormalizedLandmarkList, so copy
+    # each landmark's x/y/z into a NormalizedLandmark message first.
+    face_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
+    face_landmarks_proto.landmark.extend([
+        landmark_pb2.NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z)
+        for lm in face_landmarks
+    ])
+    if draw_full_mesh:
+        # Optional tessellation layer; landmark points themselves are not
+        # drawn (landmark_drawing_spec=None), only the connections.
         mp_drawing.draw_landmarks(
+            image=image,
+            landmark_list=face_landmarks_proto,
+            connections=FACEMESH_TESSELATION,
             landmark_drawing_spec=None,
+            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style()
         )
+    # Contours are always drawn, with or without the full mesh.
+    mp_drawing.draw_landmarks(
+        image=image,
+        landmark_list=face_landmarks_proto,
+        connections=FACEMESH_CONTOURS,
+        landmark_drawing_spec=None,
+        connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_contours_style()
+    )
+def draw_pose_landmarks(image, pose_landmarks):
+    """Overlay the pose skeleton for one detected pose onto ``image``.
+
+    Args:
+        image: Frame handed to ``mp_drawing.draw_landmarks``.
+        pose_landmarks: Iterable of landmarks exposing ``x``, ``y`` and ``z``,
+            or ``None`` when no pose was detected (nothing is drawn).
+
+    Returns:
+        None.
+    """
+    if pose_landmarks is not None:
+        # Copy each landmark's x/y/z into a NormalizedLandmark message so the
+        # list can be passed to draw_landmarks as a NormalizedLandmarkList.
+        converted = [
+            landmark_pb2.NormalizedLandmark(x=point.x, y=point.y, z=point.z)
+            for point in pose_landmarks
+        ]
+        skeleton = landmark_pb2.NormalizedLandmarkList()
+        skeleton.landmark.extend(converted)
+        joint_style = mp_drawing_styles.get_default_pose_landmarks_style()
+        mp_drawing.draw_landmarks(
+            image,
+            skeleton,
+            POSE_CONNECTIONS,
+            landmark_drawing_spec=joint_style,
+        )
 # -------------------------
     - when ear goes back above => blink end (count once)
     """
     if ear is None:
         return state
     if ear < thr:
 # -------------------------
+# Core processing with new API
 # -------------------------
 def process_video(
     video_path: str,
+    min_face_det_conf: float = 0.5,
+    min_face_track_conf: float = 0.5,
     min_pose_det_conf: float = 0.5,
     min_pose_track_conf: float = 0.5,
     ear_threshold: float = 0.21,
     blink_min_consec: int = 2,
     draw_full_face_mesh: bool = False,
+    max_frames: int = 0,
 ) -> Tuple[str, str, str, str]:
     """
+    Process video using new MediaPipe API with GPU support
     """
+    # Download models first
+    face_model_path, pose_model_path = download_models()
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
         raise RuntimeError("Cannot open video. Please upload a valid video file.")
     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    # Output paths
     tmpdir = tempfile.mkdtemp(prefix="mp_analysis_")
     out_video = os.path.join(tmpdir, "annotated.mp4")
     out_csv = os.path.join(tmpdir, "per_frame_metrics.csv")
     fourcc = cv2.VideoWriter_fourcc(*"mp4v")
     writer = cv2.VideoWriter(out_video, fourcc, fps, (width, height))
+    # Create face landmarker with GPU delegate
+    base_options_face = python.BaseOptions(
+        model_asset_path=face_model_path,
+        delegate=python.BaseOptions.Delegate.GPU
+    )
+    face_options = vision.FaceLandmarkerOptions(
+        base_options=base_options_face,
+        running_mode=vision.RunningMode.VIDEO,
+        num_faces=1,
+        min_face_detection_confidence=min_face_det_conf,
+        min_face_presence_confidence=min_face_track_conf,
+        min_tracking_confidence=min_face_track_conf,
+        output_face_blendshapes=False,
+        output_facial_transformation_matrixes=False
+    )
+    # Create pose landmarker with GPU delegate
+    base_options_pose = python.BaseOptions(
+        model_asset_path=pose_model_path,
+        delegate=python.BaseOptions.Delegate.GPU
+    )
+    pose_options = vision.PoseLandmarkerOptions(
+        base_options=base_options_pose,
+        running_mode=vision.RunningMode.VIDEO,
+        num_poses=1,
+        min_pose_detection_confidence=min_pose_det_conf,
+        min_pose_presence_confidence=min_pose_track_conf,
         min_tracking_confidence=min_pose_track_conf,
+        output_segmentation_masks=False
+    )
+    with vision.FaceLandmarker.create_from_options(face_options) as face_landmarker, \
+         vision.PoseLandmarker.create_from_options(pose_options) as pose_landmarker:
         rows = []
+        prev_pts = {}
         left_blink = BlinkState()
         right_blink = BlinkState()
             if max_frames and frame_idx > max_frames:
                 break
+            # Convert to RGB and create MediaPipe Image
             frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame_rgb)
+            # Timestamp in milliseconds
+            timestamp_ms = int((frame_idx - 1) * 1000 / fps)
+            # Process with new API
+            face_result = face_landmarker.detect_for_video(mp_image, timestamp_ms)
+            pose_result = pose_landmarker.detect_for_video(mp_image, timestamp_ms)
+            # Extract face landmarks
             face_pts: Dict[int, np.ndarray] = {}
+            face_landmarks = None
+            if face_result.face_landmarks:
+                face_landmarks = face_result.face_landmarks[0]
+                for i, lm in enumerate(face_landmarks):
+                    face_pts[i] = np.array([lm.x * width, lm.y * height], dtype=np.float32)
+            # Calculate EAR
             left_ear = eye_aspect_ratio(face_pts, LEFT_EYE_EAR_IDX)
             right_ear = eye_aspect_ratio(face_pts, RIGHT_EYE_EAR_IDX)
             left_blink = update_blink(left_blink, left_ear, ear_threshold, blink_min_consec)
             right_blink = update_blink(right_blink, right_ear, ear_threshold, blink_min_consec)
+            # Extract pose landmarks
             pose_norm: Dict[str, Optional[np.ndarray]] = {}
             pose_px: Dict[str, Optional[np.ndarray]] = {}
+            pose_landmarks = None
+            if pose_result.pose_landmarks:
+                pose_landmarks = pose_result.pose_landmarks[0]
+                for name, idx in POSE_LANDMARKS.items():
+                    if idx < len(pose_landmarks):
+                        lm = pose_landmarks[idx]
+                        pose_norm[name] = np.array([lm.x, lm.y], dtype=np.float32)
+                        pose_px[name] = np.array([lm.x * width, lm.y * height], dtype=np.float32)
                     else:
                         pose_norm[name] = None
                         pose_px[name] = None
             else:
+                for name in POSE_LANDMARKS:
                     pose_norm[name] = None
                     pose_px[name] = None
+            # Movement metrics
             def movement_metrics(key: str):
                 cur = pose_norm.get(key)
                 if cur is None:
             la_d, la_v = movement_metrics("left_ankle")
             ra_d, ra_v = movement_metrics("right_ankle")
+            # Joint angles
             def get_angle(a, b, c):
                 if a is None or b is None or c is None:
                     return None
             right_knee_ang = get_angle(pose_px["right_hip"], pose_px["right_knee"], pose_px["right_ankle"])
             # Draw overlays
+            draw_pose_landmarks(frame_bgr, pose_landmarks)
+            draw_face_landmarks(frame_bgr, face_landmarks, draw_full_mesh=draw_full_face_mesh)
             # HUD text
             hud_lines = [
+                f"Frame: {frame_idx}/{total_frames if total_frames>0 else '?'}  FPS:{fps:.1f}",
                 f"EAR L:{left_ear:.3f}" if left_ear is not None else "EAR L:None",
                 f"EAR R:{right_ear:.3f}" if right_ear is not None else "EAR R:None",
+                f"Blinks L:{left_blink.blink_count}  R:{right_blink.blink_count}",
             ]
             y0 = 24
             for line in hud_lines:
+                cv2.putText(frame_bgr, line, (12, y0), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
                 y0 += 22
             writer.write(frame_bgr)
             rows.append({
                 "frame": frame_idx,
                 "time_s": (frame_idx - 1) / fps,
                 "left_ear": left_ear,
                 "right_ear": right_ear,
                 "lw_disp": lw_d,
                 "rw_disp": rw_d,
                 "la_disp": la_d,
                 "ra_disp": ra_d,
                 "lw_speed": lw_v,
                 "rw_speed": rw_v,
                 "la_speed": la_v,
                 "ra_speed": ra_v,
                 "left_elbow_angle": left_elbow_ang,
                 "right_elbow_angle": right_elbow_ang,
                 "left_knee_angle": left_knee_ang,
             return {"mean": None, "min": None, "max": None}
         return {"mean": float(s2.mean()), "min": float(s2.min()), "max": float(s2.max())}
     summary = {
         "video": {
             "fps": float(fps),
     with open(out_json, "w", encoding="utf-8") as f:
         json.dump(summary, f, ensure_ascii=False, indent=2)
+    report_md = f"""# MediaPipe 面部+姿态分析报告 (GPU加速)
+## 视频信息
+- 分辨率: {width} x {height}
 - FPS: {fps:.2f}
+- 处理帧数: {len(df)}
+- 时长: {summary["video"]["duration_s"]:.2f}秒
+## 眨眼分析 (EAR)
+- 阈值: {ear_threshold}
+- 最小连续帧: {blink_min_consec}
+- 左眼眨眼: {summary["blink"]["left_blinks"]}次 ({summary["blink"]["left_blinks_per_min"]:.2f} 次/分钟)
+- 右眼眨眼: {summary["blink"]["right_blinks"]}次 ({summary["blink"]["right_blinks_per_min"]:.2f} 次/分钟)
+- 左眼EAR: 平均={summary["blink"]["left_ear_stats"]["mean"]}  最小={summary["blink"]["left_ear_stats"]["min"]}  最大={summary["blink"]["left_ear_stats"]["max"]}
+- 右眼EAR: 平均={summary["blink"]["right_ear_stats"]["mean"]} 最小={summary["blink"]["right_ear_stats"]["min"]} 最大={summary["blink"]["right_ear_stats"]["max"]}
+## 肢体运动量 (归一化单位)
+> 基于归一化坐标(0~1)计算，适合相对比较和趋势分析
+- 累计位移 (数值越大=运动越多):
+  - 左手腕: {summary["limb_movement"]["total_disp"]["left_wrist"]:.6f}
+  - 右手腕: {summary["limb_movement"]["total_disp"]["right_wrist"]:.6f}
+  - 左脚踝: {summary["limb_movement"]["total_disp"]["left_ankle"]:.6f}
+  - 右脚踝: {summary["limb_movement"]["total_disp"]["right_ankle"]:.6f}
+## 输出文件
+- annotated.mp4: 叠加了姿态和面部mesh的视频
+- per_frame_metrics.csv: 逐帧指标
+- summary.json: 统计汇总
+**使用GPU加速处理 | 新版Face Landmarker API**
 """
     with open(out_report, "w", encoding="utf-8") as f:
         f.write(report_md)
 # -------------------------
 def ui_process(
     video,
+    min_face_det_conf,
+    min_face_track_conf,
     min_pose_det_conf,
     min_pose_track_conf,
     ear_threshold,
     blink_min_consec,
     draw_full_face_mesh,
     max_frames
 ):
     if isinstance(video, dict) and "path" in video:
         video_path = video["path"]
     else:
     try:
         out_video, out_csv, out_json, out_report = process_video(
             video_path=str(video_path),
+            min_face_det_conf=float(min_face_det_conf),
+            min_face_track_conf=float(min_face_track_conf),
             min_pose_det_conf=float(min_pose_det_conf),
             min_pose_track_conf=float(min_pose_track_conf),
             ear_threshold=float(ear_threshold),
             blink_min_consec=int(blink_min_consec),
             draw_full_face_mesh=bool(draw_full_face_mesh),
             max_frames=int(max_frames),
         )
         with open(out_report, "r", encoding="utf-8") as f:
             report_text = f.read()
         return out_video, out_csv, out_json, report_text
     except Exception as e:
+        import traceback
+        error_msg = f"# 处理视频时出错\n\n```\n{traceback.format_exc()}\n```"
         return None, None, None, error_msg
+demo = gr.Blocks(title="视频姿态+面部分析 (GPU加速)")
 with demo:
+    gr.Markdown("""
+    ## 上传视频 → MediaPipe GPU加速 → 姿态+面部mesh追踪 + 眨眼/肢体运动分析
+    **特性:**
+    - ✅ GPU加速处理
+    - ✅ 新版Face Landmarker API (更精确的面部mesh)
+    - ✅ 眨眼检测 (EAR算法)
+    - ✅ 肢体运动量化
+    - ✅ 关节角度分析
+    """)
     with gr.Row():
+        video_in = gr.Video(label="上传视频")
+    with gr.Accordion("参数设置 (默认值通常就够用)", open=False):
+        gr.Markdown("### 面部检测参数")
+        min_face_det_conf = gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="面部检测置信度阈值")
+        min_face_track_conf = gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="面部追踪置信度阈值")
+        gr.Markdown("### 姿态检测参数")
+        min_pose_det_conf = gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="姿态检测置信度阈值")
+        min_pose_track_conf = gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="姿态追踪置信度阈值")
+        gr.Markdown("### 眨眼检测参数")
+        ear_threshold = gr.Slider(0.10, 0.35, value=0.21, step=0.01, label="眨眼阈值 (EAR, 越小越严格)")
+        blink_min_consec = gr.Slider(1, 6, value=2, step=1, label="眨眼最小连续帧数 (抗抖动)")
+        gr.Markdown("### 可视化选项")
+        draw_full_face_mesh = gr.Checkbox(value=False, label="绘制完整面部mesh (更密集，速度较慢)")
+        max_frames = gr.Number(value=0, precision=0, label="最多处理帧数 (0=全部处理，调试可设300)")
+    run_btn = gr.Button("🚀 开始分析 (GPU加速)", variant="primary", size="lg")
     with gr.Row():
+        video_out = gr.Video(label="输出: 标注后的视频")
     with gr.Row():
+        csv_out = gr.File(label="逐帧指标CSV")
+        json_out = gr.File(label="汇总JSON")
     report_out = gr.Markdown()
     run_btn.click(
         fn=ui_process,
         inputs=[
             video_in,
+            min_face_det_conf,
+            min_face_track_conf,
             min_pose_det_conf,
             min_pose_track_conf,
             ear_threshold,
             blink_min_consec,
             draw_full_face_mesh,