XiaoBai1221 committed on
Commit
de897a8
·
1 Parent(s): 2101309

🎯 修正特徵提取以匹配訓練時的方法

Browse files

**核心修正:**
- 使用 mp.solutions.holistic.Holistic (與訓練一致)
- 修正關鍵點順序:手部(左+右) + 姿勢 (63+63+99=225)
- 實現與訓練時相同的區域性光流計算
- 添加手部遮罩創建 (create_hand_mask)
- 使用 calcOpticalFlowFarneback 替代 calcOpticalFlowPyrLK

**技術改進:**
- 每幀同時提取關鍵點和MediaPipe結果
- 基於實際檢測到的關鍵點創建ROI遮罩
- 統一數據類型為 np.float32/np.float16
- 與sign_language_recognition.py保持完全一致

**預期效果:**
- 大幅提升預測準確率
- 特徵提取與訓練時100%匹配

Files changed (1) hide show
  1. app.py +139 -84
app.py CHANGED
@@ -268,102 +268,140 @@ else:
268
  raise FileNotFoundError(f"模型檔案不存在: {model_path}")
269
 
270
  def extract_keypoints_from_frame(frame):
271
- """從單個frame提取關鍵點"""
272
  try:
273
- with mp_pose.Pose(static_image_mode=True, model_complexity=1) as pose, \
274
- mp_hands.Hands(static_image_mode=True, max_num_hands=2) as hands, \
275
- mp_face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1) as face_mesh:
 
 
276
 
277
- rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
 
 
 
 
278
 
279
  keypoints = []
280
 
281
- # 提取姿勢關鍵點 (33個點 * 3維 = 99)
282
- pose_results = pose.process(rgb_frame)
283
- if pose_results.pose_landmarks:
284
- pose_points = []
285
- for landmark in pose_results.pose_landmarks.landmark:
286
- pose_points.extend([landmark.x, landmark.y, landmark.z])
287
- keypoints.extend(pose_points)
288
  else:
289
- keypoints.extend([0.0] * 99)
290
-
291
- # 提取手部關鍵點 (21個點 * 2隻手 * 3維 = 126)
292
- hands_results = hands.process(rgb_frame)
293
- if hands_results.multi_hand_landmarks:
294
- hand_points = []
295
- for hand_landmarks in hands_results.multi_hand_landmarks:
296
- for landmark in hand_landmarks.landmark:
297
- hand_points.extend([landmark.x, landmark.y, landmark.z])
298
 
299
- # 確保有126個手部關鍵點 (2隻手)
300
- if len(hand_points) >= 126:
301
- keypoints.extend(hand_points[:126])
302
- else:
303
- keypoints.extend(hand_points + [0.0] * (126 - len(hand_points)))
304
  else:
305
- keypoints.extend([0.0] * 126)
306
 
307
- # 如果需要,確保總共225個特徵
308
- while len(keypoints) < 225:
309
- keypoints.append(0.0)
 
 
 
 
 
310
 
311
- return np.array(keypoints[:225], dtype=np.float32)
312
  except Exception as e:
313
  print(f"關鍵點提取錯誤: {e}")
314
- return np.zeros(225, dtype=np.float32)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
- def calculate_optical_flow_features(frame1, frame2):
317
- """計算光流特徵"""
318
  try:
319
- # 轉為灰階
320
- gray1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
321
- gray2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
322
-
323
- # 檢測角點特徵
324
- corners = cv2.goodFeaturesToTrack(gray1, maxCorners=100, qualityLevel=0.3, minDistance=7, blockSize=7)
325
-
326
- # 如果沒有檢測到足夠的角點,返回零向量
327
- if corners is None or len(corners) < 5:
328
- return np.zeros(10)
329
-
330
- # 確保角點格式正確
331
- corners = np.float32(corners).reshape(-1, 1, 2)
332
-
333
- # 計算光流
334
- new_corners, status, error = cv2.calcOpticalFlowPyrLK(gray1, gray2, corners, None)
335
-
336
- # 選擇好的角點
337
- good_new = new_corners[status == 1]
338
- good_old = corners[status == 1]
339
-
340
- # 如果沒有足夠的好角點,返回零向量
341
- if len(good_new) < 2 or len(good_old) < 2:
342
- return np.zeros(10)
343
-
344
- # 計算光流向量
345
- flow_vectors = good_new - good_old
346
-
347
- # 計算統計特徵
348
- magnitude = np.sqrt(flow_vectors[:, 0]**2 + flow_vectors[:, 1]**2)
349
- direction = np.arctan2(flow_vectors[:, 1], flow_vectors[:, 0])
350
 
351
- # 提取10維特徵
352
- features = [
353
- np.mean(magnitude), np.std(magnitude), np.max(magnitude), np.min(magnitude),
354
- np.mean(direction), np.std(direction),
355
- np.mean(flow_vectors[:, 0]), np.std(flow_vectors[:, 0]),
356
- np.mean(flow_vectors[:, 1]), np.std(flow_vectors[:, 1])
357
- ]
358
 
359
- # 處理 NaN 值
360
- features = [f if not np.isnan(f) else 0.0 for f in features]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
- return np.array(features)
363
 
364
  except Exception as e:
365
- print(f"光流計算錯誤: {e}")
366
- return np.zeros(10)
367
 
368
  def predict_sign_language(video_path):
369
  """預測手語影片"""
@@ -382,16 +420,33 @@ def predict_sign_language(video_path):
382
  if len(frames) == 0:
383
  return "錯誤:無法讀取影片幀", 0.0
384
 
385
- # 提取特徵
386
  keypoints_sequence = []
 
 
387
  for frame in frames:
388
- keypoints = extract_keypoints_from_frame(frame)
389
  keypoints_sequence.append(keypoints)
 
390
 
391
  # 計算每一幀的光流特徵
392
  flow_features = []
393
  for i in range(len(frames) - 1):
394
- flow = calculate_optical_flow_features(frames[i], frames[i + 1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  flow_features.append(flow)
396
 
397
  # 確保光流特徵的幀數與關鍵點一致
@@ -401,7 +456,7 @@ def predict_sign_language(video_path):
401
  if flow_features:
402
  flow_features.append(flow_features[-1])
403
  else:
404
- flow_features.append(np.zeros(10))
405
 
406
  # 確保序列長度為50 (與訓練時一致)
407
  target_length = 50
@@ -415,10 +470,10 @@ def predict_sign_language(video_path):
415
  while len(keypoints_sequence) < target_length:
416
  if keypoints_sequence:
417
  keypoints_sequence.append(keypoints_sequence[-1])
418
- flow_features.append(flow_features[-1] if flow_features else np.zeros(10))
419
  else:
420
- keypoints_sequence.append(np.zeros(225))
421
- flow_features.append(np.zeros(10))
422
 
423
  # 轉換為numpy數組再轉為tensor (避免警告)
424
  keypoints_array = np.array(keypoints_sequence, dtype=np.float32)
 
268
  raise FileNotFoundError(f"模型檔案不存在: {model_path}")
269
 
270
  def extract_keypoints_from_frame(frame):
271
+ """從單個frame提取關鍵點 - 與訓練時一致"""
272
  try:
273
+ with mp.solutions.holistic.Holistic(
274
+ static_image_mode=True,
275
+ model_complexity=1,
276
+ min_detection_confidence=0.5,
277
+ min_tracking_confidence=0.5) as holistic:
278
 
279
+ # 轉換為RGB格式
280
+ frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
281
+ frame_rgb.flags.writeable = False
282
+ results = holistic.process(frame_rgb)
283
+ frame_rgb.flags.writeable = True
284
 
285
  keypoints = []
286
 
287
+ # 提取手部關鍵點 (左手: 21個點 * 3維 = 63)
288
+ if results.left_hand_landmarks:
289
+ for landmark in results.left_hand_landmarks.landmark:
290
+ keypoints.extend([landmark.x, landmark.y, landmark.z])
 
 
 
291
  else:
292
+ keypoints.extend([0] * (21 * 3))
 
 
 
 
 
 
 
 
293
 
294
+ # 提取手部關鍵點 (右手: 21個點 * 3維 = 63)
295
+ if results.right_hand_landmarks:
296
+ for landmark in results.right_hand_landmarks.landmark:
297
+ keypoints.extend([landmark.x, landmark.y, landmark.z])
 
298
  else:
299
+ keypoints.extend([0] * (21 * 3))
300
 
301
+ # 提取姿勢關鍵點 (33個點 * 3維 = 99)
302
+ if results.pose_landmarks:
303
+ for landmark in results.pose_landmarks.landmark:
304
+ keypoints.extend([landmark.x, landmark.y, landmark.z])
305
+ else:
306
+ keypoints.extend([0] * (33 * 3))
307
+
308
+ return np.array(keypoints[:225], dtype=np.float32), results
309
 
 
310
  except Exception as e:
311
  print(f"關鍵點提取錯誤: {e}")
312
+ return np.zeros(225, dtype=np.float32), None
313
+
314
+ def create_hand_mask(frame, left_hand_landmarks, right_hand_landmarks, pose_landmarks):
315
+ """創建手部和上半身的ROI遮罩 - 與訓練時一致"""
316
+ h, w = frame.shape[:2]
317
+ mask = np.zeros((h, w), dtype=np.uint8)
318
+
319
+ def draw_landmarks_on_mask(landmarks, radius=15):
320
+ if landmarks:
321
+ for landmark in landmarks.landmark:
322
+ x, y = int(landmark.x * w), int(landmark.y * h)
323
+ if 0 <= x < w and 0 <= y < h:
324
+ cv2.circle(mask, (x, y), radius=radius, color=255, thickness=-1)
325
+
326
+ # 繪製左手關鍵點
327
+ draw_landmarks_on_mask(left_hand_landmarks, radius=20)
328
+
329
+ # 繪製右手關鍵點
330
+ draw_landmarks_on_mask(right_hand_landmarks, radius=20)
331
+
332
+ # 只繪製上半身關鍵點 (頭部、肩膀、手臂)
333
+ if pose_landmarks:
334
+ upper_body_indices = list(range(0, 25)) # 0-24為上半身關鍵點
335
+ for idx in upper_body_indices:
336
+ if idx < len(pose_landmarks.landmark):
337
+ landmark = pose_landmarks.landmark[idx]
338
+ x, y = int(landmark.x * w), int(landmark.y * h)
339
+ if 0 <= x < w and 0 <= y < h:
340
+ cv2.circle(mask, (x, y), radius=10, color=255, thickness=-1)
341
+
342
+ # 擴大遮罩區域,使用膨脹操作
343
+ kernel = np.ones((15, 15), np.uint8)
344
+ dilated_mask = cv2.dilate(mask, kernel, iterations=1)
345
+
346
+ return dilated_mask
347
 
348
+ def compute_regional_optical_flow(prev_frame, curr_frame, mask, downscale=0.5):
349
+ """計算區域性光流特徵 - 與訓練時一致"""
350
  try:
351
+ # 降低解析度
352
+ if downscale < 1.0:
353
+ h, w = prev_frame.shape[:2]
354
+ new_h, new_w = int(h * downscale), int(w * downscale)
355
+ prev_small = cv2.resize(prev_frame, (new_w, new_h))
356
+ curr_small = cv2.resize(curr_frame, (new_w, new_h))
357
+ mask_small = cv2.resize(mask, (new_w, new_h))
358
+ else:
359
+ prev_small = prev_frame
360
+ curr_small = curr_frame
361
+ mask_small = mask
362
+
363
+ # 轉換為灰度圖
364
+ prev_gray = cv2.cvtColor(prev_small, cv2.COLOR_BGR2GRAY)
365
+ curr_gray = cv2.cvtColor(curr_small, cv2.COLOR_BGR2GRAY)
366
+
367
+ # 計算遮罩區域的光流
368
+ flow = cv2.calcOpticalFlowFarneback(
369
+ prev_gray, curr_gray,
370
+ None, # flow
371
+ 0.5, # pyr_scale
372
+ 3, # levels
373
+ 15, # winsize
374
+ 3, # iterations
375
+ 5, # poly_n
376
+ 1.2, # poly_sigma
377
+ 0 # flags
378
+ )
 
 
 
379
 
380
+ # 將mask_small轉換為布爾遮罩
381
+ bool_mask = mask_small > 0
 
 
 
 
 
382
 
383
+ # 只計算遮罩區域的光流特徵
384
+ if np.any(bool_mask):
385
+ # 提取x和y方向的光流
386
+ fx = flow[..., 0][bool_mask]
387
+ fy = flow[..., 1][bool_mask]
388
+
389
+ # 計算統計特徵
390
+ flow_features = np.array([
391
+ np.mean(fx), np.std(fx),
392
+ np.mean(fy), np.std(fy),
393
+ np.percentile(fx, 25), np.percentile(fx, 75),
394
+ np.percentile(fy, 25), np.percentile(fy, 75),
395
+ np.max(np.abs(fx)), np.max(np.abs(fy))
396
+ ], dtype=np.float16)
397
+ else:
398
+ flow_features = np.zeros(10, dtype=np.float16)
399
 
400
+ return flow_features
401
 
402
  except Exception as e:
403
+ print(f"區域性光流計算錯誤: {e}")
404
+ return np.zeros(10, dtype=np.float16)
405
 
406
  def predict_sign_language(video_path):
407
  """預測手語影片"""
 
420
  if len(frames) == 0:
421
  return "錯誤:無法讀取影片幀", 0.0
422
 
423
+ # 提取特徵 - 同時獲取關鍵點和MediaPipe結果
424
  keypoints_sequence = []
425
+ all_results = []
426
+
427
  for frame in frames:
428
+ keypoints, results = extract_keypoints_from_frame(frame)
429
  keypoints_sequence.append(keypoints)
430
+ all_results.append(results)
431
 
432
  # 計算每一幀的光流特徵
433
  flow_features = []
434
  for i in range(len(frames) - 1):
435
+ # 使用當前幀的MediaPipe結果創建遮罩
436
+ current_results = all_results[i]
437
+ if current_results is not None:
438
+ mask = create_hand_mask(
439
+ frames[i],
440
+ current_results.left_hand_landmarks,
441
+ current_results.right_hand_landmarks,
442
+ current_results.pose_landmarks
443
+ )
444
+ else:
445
+ # 如果沒有MediaPipe結果,創建空遮罩
446
+ h, w = frames[i].shape[:2]
447
+ mask = np.zeros((h, w), dtype=np.uint8)
448
+
449
+ flow = compute_regional_optical_flow(frames[i], frames[i + 1], mask)
450
  flow_features.append(flow)
451
 
452
  # 確保光流特徵的幀數與關鍵點一致
 
456
  if flow_features:
457
  flow_features.append(flow_features[-1])
458
  else:
459
+ flow_features.append(np.zeros(10, dtype=np.float16))
460
 
461
  # 確保序列長度為50 (與訓練時一致)
462
  target_length = 50
 
470
  while len(keypoints_sequence) < target_length:
471
  if keypoints_sequence:
472
  keypoints_sequence.append(keypoints_sequence[-1])
473
+ flow_features.append(flow_features[-1] if flow_features else np.zeros(10, dtype=np.float16))
474
  else:
475
+ keypoints_sequence.append(np.zeros(225, dtype=np.float32))
476
+ flow_features.append(np.zeros(10, dtype=np.float16))
477
 
478
  # 轉換為numpy數組再轉為tensor (避免警告)
479
  keypoints_array = np.array(keypoints_sequence, dtype=np.float32)