Spaces:

XiaoBai1221
/

SignView2.0

Sleeping

XiaoBai1221 committed on Jun 23, 2025

Commit

de897a8

1 Parent(s): 2101309

🎯 修正特徵提取以匹配訓練時的方法

**核心修正：**
- 使用 mp.solutions.holistic.Holistic (與訓練一致)
- 修正關鍵點順序：手部(左+右) + 姿勢 (63+63+99=225)
- 實現與訓練時相同的區域性光流計算
- 添加手部遮罩創建 (create_hand_mask)
- 使用 calcOpticalFlowFarneback 替代 calcOpticalFlowPyrLK

**技術改進：**
- 每幀同時提取關鍵點和MediaPipe結果
- 基於實際檢測到的關鍵點創建ROI遮罩
- 統一數據類型為 np.float32/np.float16
- 與sign_language_recognition.py保持完全一致

**預期效果：**
- 大幅提升預測準確率
- 特徵提取與訓練時100%匹配

Files changed (1) hide show

app.py +139 -84

app.py CHANGED Viewed

@@ -268,102 +268,140 @@ else:
     raise FileNotFoundError(f"模型檔案不存在: {model_path}")
 def extract_keypoints_from_frame(frame):
-    """從單個frame提取關鍵點"""
     try:
-        with mp_pose.Pose(static_image_mode=True, model_complexity=1) as pose, \
-             mp_hands.Hands(static_image_mode=True, max_num_hands=2) as hands, \
-             mp_face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1) as face_mesh:
-            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             keypoints = []
-            # 提取姿勢關鍵點 (33個點 * 3維 = 99)
-            pose_results = pose.process(rgb_frame)
-            if pose_results.pose_landmarks:
-                pose_points = []
-                for landmark in pose_results.pose_landmarks.landmark:
-                    pose_points.extend([landmark.x, landmark.y, landmark.z])
-                keypoints.extend(pose_points)
             else:
-                keypoints.extend([0.0] * 99)
-            # 提取手部關鍵點 (21個點 * 2隻手 * 3維 = 126)
-            hands_results = hands.process(rgb_frame)
-            if hands_results.multi_hand_landmarks:
-                hand_points = []
-                for hand_landmarks in hands_results.multi_hand_landmarks:
-                    for landmark in hand_landmarks.landmark:
-                        hand_points.extend([landmark.x, landmark.y, landmark.z])
-                # 確保有126個手部關鍵點 (2隻手)
-                if len(hand_points) >= 126:
-                    keypoints.extend(hand_points[:126])
-                else:
-                    keypoints.extend(hand_points + [0.0] * (126 - len(hand_points)))
             else:
-                keypoints.extend([0.0] * 126)
-            # 如果需要，確保總共225個特徵
-            while len(keypoints) < 225:
-                keypoints.append(0.0)
-            return np.array(keypoints[:225], dtype=np.float32)
     except Exception as e:
         print(f"關鍵點提取錯誤: {e}")
-        return np.zeros(225, dtype=np.float32)
-def calculate_optical_flow_features(frame1, frame2):
-    """計算光流特徵"""
     try:
-        # 轉為灰階
-        gray1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
-        gray2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
-        # 檢測角點特徵
-        corners = cv2.goodFeaturesToTrack(gray1, maxCorners=100, qualityLevel=0.3, minDistance=7, blockSize=7)
-        # 如果沒有檢測到足夠的角點，返回零向量
-        if corners is None or len(corners) < 5:
-            return np.zeros(10)
-        # 確保角點格式正確
-        corners = np.float32(corners).reshape(-1, 1, 2)
-        # 計算光流
-        new_corners, status, error = cv2.calcOpticalFlowPyrLK(gray1, gray2, corners, None)
-        # 選擇好的角點
-        good_new = new_corners[status == 1]
-        good_old = corners[status == 1]
-        # 如果沒有足夠的好角點，返回零向量
-        if len(good_new) < 2 or len(good_old) < 2:
-            return np.zeros(10)
-        # 計算光流向量
-        flow_vectors = good_new - good_old
-        # 計算統計特徵
-        magnitude = np.sqrt(flow_vectors[:, 0]**2 + flow_vectors[:, 1]**2)
-        direction = np.arctan2(flow_vectors[:, 1], flow_vectors[:, 0])
-        # 提取10維特徵
-        features = [
-            np.mean(magnitude), np.std(magnitude), np.max(magnitude), np.min(magnitude),
-            np.mean(direction), np.std(direction),
-            np.mean(flow_vectors[:, 0]), np.std(flow_vectors[:, 0]),
-            np.mean(flow_vectors[:, 1]), np.std(flow_vectors[:, 1])
-        ]
-        # 處理 NaN 值
-        features = [f if not np.isnan(f) else 0.0 for f in features]
-        return np.array(features)
     except Exception as e:
-        print(f"光流計算錯誤: {e}")
-        return np.zeros(10)
 def predict_sign_language(video_path):
     """預測手語影片"""
@@ -382,16 +420,33 @@ def predict_sign_language(video_path):
         if len(frames) == 0:
             return "錯誤：無法讀取影片幀", 0.0
-        # 提取特徵
         keypoints_sequence = []
         for frame in frames:
-            keypoints = extract_keypoints_from_frame(frame)
             keypoints_sequence.append(keypoints)
         # 計算每一幀的光流特徵
         flow_features = []
         for i in range(len(frames) - 1):
-            flow = calculate_optical_flow_features(frames[i], frames[i + 1])
             flow_features.append(flow)
         # 確保光流特徵的幀數與關鍵點一致
@@ -401,7 +456,7 @@ def predict_sign_language(video_path):
                 if flow_features:
                     flow_features.append(flow_features[-1])
                 else:
-                    flow_features.append(np.zeros(10))
         # 確保序列長度為50 (與訓練時一致)
         target_length = 50
@@ -415,10 +470,10 @@ def predict_sign_language(video_path):
             while len(keypoints_sequence) < target_length:
                 if keypoints_sequence:
                     keypoints_sequence.append(keypoints_sequence[-1])
-                    flow_features.append(flow_features[-1] if flow_features else np.zeros(10))
                 else:
-                    keypoints_sequence.append(np.zeros(225))
-                    flow_features.append(np.zeros(10))
         # 轉換為numpy數組再轉為tensor (避免警告)
         keypoints_array = np.array(keypoints_sequence, dtype=np.float32)

     raise FileNotFoundError(f"模型檔案不存在: {model_path}")
 def extract_keypoints_from_frame(frame):
+    """從單個frame提取關鍵點 - 與訓練時一致"""
     try:
+        with mp.solutions.holistic.Holistic(
+            static_image_mode=True,
+            model_complexity=1,
+            min_detection_confidence=0.5,
+            min_tracking_confidence=0.5) as holistic:
+            # 轉換為RGB格式
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame_rgb.flags.writeable = False
+            results = holistic.process(frame_rgb)
+            frame_rgb.flags.writeable = True
             keypoints = []
+            # 提取手部關鍵點 (左手: 21個點 * 3維 = 63)
+            if results.left_hand_landmarks:
+                for landmark in results.left_hand_landmarks.landmark:
+                    keypoints.extend([landmark.x, landmark.y, landmark.z])
             else:
+                keypoints.extend([0] * (21 * 3))
+            # 提取手部關鍵點 (右手: 21個點 * 3維 = 63)
+            if results.right_hand_landmarks:
+                for landmark in results.right_hand_landmarks.landmark:
+                    keypoints.extend([landmark.x, landmark.y, landmark.z])
             else:
+                keypoints.extend([0] * (21 * 3))
+            # 提取姿勢關鍵點 (33個點 * 3維 = 99)
+            if results.pose_landmarks:
+                for landmark in results.pose_landmarks.landmark:
+                    keypoints.extend([landmark.x, landmark.y, landmark.z])
+            else:
+                keypoints.extend([0] * (33 * 3))
+            return np.array(keypoints[:225], dtype=np.float32), results
     except Exception as e:
         print(f"關鍵點提取錯誤: {e}")
+        return np.zeros(225, dtype=np.float32), None
def create_hand_mask(frame, left_hand_landmarks, right_hand_landmarks, pose_landmarks):
    """Build a binary ROI mask covering the hands and the first 25 pose landmarks.

    Each landmark's ``x``/``y`` is scaled by the frame width/height and a
    filled circle is stamped at that pixel: radius 20 for hand landmarks,
    radius 10 for pose landmarks 0-24 (labelled as the upper body by the
    original author). Landmarks whose centre falls outside the frame are
    skipped. The mask is then dilated with a 15x15 kernel to widen the region.
    The original note says this is kept consistent with the training pipeline.

    Args:
        frame: Image whose first two shape entries give the mask height/width.
        left_hand_landmarks: Left-hand landmark container, or a falsy value.
        right_hand_landmarks: Right-hand landmark container, or a falsy value.
        pose_landmarks: Pose landmark container, or a falsy value.

    Returns:
        A ``uint8`` array of shape ``(height, width)`` with 255 inside the ROI
        and 0 elsewhere.
    """
    height, width = frame.shape[:2]
    mask = np.zeros((height, width), dtype=np.uint8)

    def stamp(points, radius):
        # Draw a filled disc for every point whose centre lies inside the frame.
        for point in points:
            px, py = int(point.x * width), int(point.y * height)
            if 0 <= px < width and 0 <= py < height:
                cv2.circle(mask, (px, py), radius=radius, color=255, thickness=-1)

    for hand in (left_hand_landmarks, right_hand_landmarks):
        if hand:
            stamp(hand.landmark, 20)

    if pose_landmarks:
        pose_points = pose_landmarks.landmark
        # Only indices 0-24 are used; tolerate containers shorter than that.
        upper_count = min(25, len(pose_points))
        stamp([pose_points[idx] for idx in range(upper_count)], 10)

    # Dilate once so the mask also covers the area around each disc.
    kernel = np.ones((15, 15), np.uint8)
    return cv2.dilate(mask, kernel, iterations=1)
def compute_regional_optical_flow(prev_frame, curr_frame, mask, downscale=0.5):
    """Summarise dense optical flow inside a masked region as 10 statistics.

    Both frames (and the mask) are optionally shrunk by ``downscale``,
    converted to grayscale, and passed to ``cv2.calcOpticalFlowFarneback``.
    Only pixels where the (resized) mask is non-zero contribute. The original
    note says this is kept consistent with the training pipeline.

    Feature order: mean(fx), std(fx), mean(fy), std(fy), 25th and 75th
    percentile of fx, 25th and 75th percentile of fy, max|fx|, max|fy|.

    Args:
        prev_frame: Earlier BGR frame.
        curr_frame: Later BGR frame.
        mask: Single-channel ROI mask matching the frame size.
        downscale: Resize factor applied when it is below 1.0.

    Returns:
        A ``float16`` array of length 10. It is all zeros when the mask selects
        no pixels, or when any step fails (the error is printed, not raised).
    """
    empty_features = np.zeros(10, dtype=np.float16)
    try:
        if downscale < 1.0:
            # Work at reduced resolution; the mask is resized to stay aligned.
            h, w = prev_frame.shape[:2]
            target_size = (int(w * downscale), int(h * downscale))
            prev_small = cv2.resize(prev_frame, target_size)
            curr_small = cv2.resize(curr_frame, target_size)
            mask_small = cv2.resize(mask, target_size)
        else:
            prev_small, curr_small, mask_small = prev_frame, curr_frame, mask

        prev_gray = cv2.cvtColor(prev_small, cv2.COLOR_BGR2GRAY)
        curr_gray = cv2.cvtColor(curr_small, cv2.COLOR_BGR2GRAY)

        flow = cv2.calcOpticalFlowFarneback(
            prev_gray, curr_gray,
            None,  # flow
            0.5,   # pyr_scale
            3,     # levels
            15,    # winsize
            3,     # iterations
            5,     # poly_n
            1.2,   # poly_sigma
            0,     # flags
        )

        roi = mask_small > 0
        if not np.any(roi):
            return empty_features

        # Per-pixel horizontal / vertical displacement restricted to the ROI.
        fx = flow[..., 0][roi]
        fy = flow[..., 1][roi]
        return np.array([
            np.mean(fx), np.std(fx),
            np.mean(fy), np.std(fy),
            np.percentile(fx, 25), np.percentile(fx, 75),
            np.percentile(fy, 25), np.percentile(fy, 75),
            np.max(np.abs(fx)), np.max(np.abs(fy)),
        ], dtype=np.float16)
    except Exception as e:
        # Deliberate best-effort: a failed pair contributes a zero vector.
        print(f"區域性光流計算錯誤: {e}")
        return np.zeros(10, dtype=np.float16)
 def predict_sign_language(video_path):
     """預測手語影片"""
         if len(frames) == 0:
             return "錯誤：無法讀取影片幀", 0.0
+        # 提取特徵 - 同時獲取關鍵點和MediaPipe結果
         keypoints_sequence = []
+        all_results = []
         for frame in frames:
+            keypoints, results = extract_keypoints_from_frame(frame)
             keypoints_sequence.append(keypoints)
+            all_results.append(results)
         # 計算每一幀的光流特徵
         flow_features = []
         for i in range(len(frames) - 1):
+            # 使用當前幀的MediaPipe結果創建遮罩
+            current_results = all_results[i]
+            if current_results is not None:
+                mask = create_hand_mask(
+                    frames[i],
+                    current_results.left_hand_landmarks,
+                    current_results.right_hand_landmarks,
+                    current_results.pose_landmarks
+                )
+            else:
+                # 如果沒有MediaPipe結果，創建空遮罩
+                h, w = frames[i].shape[:2]
+                mask = np.zeros((h, w), dtype=np.uint8)
+            flow = compute_regional_optical_flow(frames[i], frames[i + 1], mask)
             flow_features.append(flow)
         # 確保光流特徵的幀數與關鍵點一致
                 if flow_features:
                     flow_features.append(flow_features[-1])
                 else:
+                    flow_features.append(np.zeros(10, dtype=np.float16))
         # 確保序列長度為50 (與訓練時一致)
         target_length = 50
             while len(keypoints_sequence) < target_length:
                 if keypoints_sequence:
                     keypoints_sequence.append(keypoints_sequence[-1])
+                    flow_features.append(flow_features[-1] if flow_features else np.zeros(10, dtype=np.float16))
                 else:
+                    keypoints_sequence.append(np.zeros(225, dtype=np.float32))
+                    flow_features.append(np.zeros(10, dtype=np.float16))
         # 轉換為numpy數組再轉為tensor (避免警告)
         keypoints_array = np.array(keypoints_sequence, dtype=np.float32)