Spaces:
Sleeping
Sleeping
Commit
·
de897a8
1
Parent(s):
2101309
🎯 修正特徵提取以匹配訓練時的方法
Browse files

**核心修正:**
- 使用 mp.solutions.holistic.Holistic (與訓練一致)
- 修正關鍵點順序:手部(左+右) + 姿勢 (63+63+99=225)
- 實現與訓練時相同的區域性光流計算
- 添加手部遮罩創建 (create_hand_mask)
- 使用 calcOpticalFlowFarneback 替代 calcOpticalFlowPyrLK
**技術改進:**
- 每幀同時提取關鍵點和MediaPipe結果
- 基於實際檢測到的關鍵點創建ROI遮罩
- 統一數據類型為 np.float32/np.float16
- 與sign_language_recognition.py保持完全一致
**預期效果:**
- 大幅提升預測準確率
- 特徵提取與訓練時100%匹配
app.py
CHANGED
|
@@ -268,102 +268,140 @@ else:
|
|
| 268 |
raise FileNotFoundError(f"模型檔案不存在: {model_path}")
|
| 269 |
|
| 270 |
def extract_keypoints_from_frame(frame):
|
| 271 |
-
"""從單個frame提取關鍵點"""
|
| 272 |
try:
|
| 273 |
-
with
|
| 274 |
-
|
| 275 |
-
|
|
|
|
|
|
|
| 276 |
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
keypoints = []
|
| 280 |
|
| 281 |
-
#
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
for landmark in pose_results.pose_landmarks.landmark:
|
| 286 |
-
pose_points.extend([landmark.x, landmark.y, landmark.z])
|
| 287 |
-
keypoints.extend(pose_points)
|
| 288 |
else:
|
| 289 |
-
keypoints.extend([0
|
| 290 |
-
|
| 291 |
-
# 提取手部關鍵點 (21個點 * 2隻手 * 3維 = 126)
|
| 292 |
-
hands_results = hands.process(rgb_frame)
|
| 293 |
-
if hands_results.multi_hand_landmarks:
|
| 294 |
-
hand_points = []
|
| 295 |
-
for hand_landmarks in hands_results.multi_hand_landmarks:
|
| 296 |
-
for landmark in hand_landmarks.landmark:
|
| 297 |
-
hand_points.extend([landmark.x, landmark.y, landmark.z])
|
| 298 |
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
keypoints.extend(hand_points + [0.0] * (126 - len(hand_points)))
|
| 304 |
else:
|
| 305 |
-
keypoints.extend([0
|
| 306 |
|
| 307 |
-
#
|
| 308 |
-
|
| 309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
|
| 311 |
-
return np.array(keypoints[:225], dtype=np.float32)
|
| 312 |
except Exception as e:
|
| 313 |
print(f"關鍵點提取錯誤: {e}")
|
| 314 |
-
return np.zeros(225, dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
|
| 316 |
-
def
|
| 317 |
-
"""
|
| 318 |
try:
|
| 319 |
-
#
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
# 計算統計特徵
|
| 348 |
-
magnitude = np.sqrt(flow_vectors[:, 0]**2 + flow_vectors[:, 1]**2)
|
| 349 |
-
direction = np.arctan2(flow_vectors[:, 1], flow_vectors[:, 0])
|
| 350 |
|
| 351 |
-
#
|
| 352 |
-
|
| 353 |
-
np.mean(magnitude), np.std(magnitude), np.max(magnitude), np.min(magnitude),
|
| 354 |
-
np.mean(direction), np.std(direction),
|
| 355 |
-
np.mean(flow_vectors[:, 0]), np.std(flow_vectors[:, 0]),
|
| 356 |
-
np.mean(flow_vectors[:, 1]), np.std(flow_vectors[:, 1])
|
| 357 |
-
]
|
| 358 |
|
| 359 |
-
#
|
| 360 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
|
| 362 |
-
return
|
| 363 |
|
| 364 |
except Exception as e:
|
| 365 |
-
print(f"
|
| 366 |
-
return np.zeros(10)
|
| 367 |
|
| 368 |
def predict_sign_language(video_path):
|
| 369 |
"""預測手語影片"""
|
|
@@ -382,16 +420,33 @@ def predict_sign_language(video_path):
|
|
| 382 |
if len(frames) == 0:
|
| 383 |
return "錯誤:無法讀取影片幀", 0.0
|
| 384 |
|
| 385 |
-
# 提取特徵
|
| 386 |
keypoints_sequence = []
|
|
|
|
|
|
|
| 387 |
for frame in frames:
|
| 388 |
-
keypoints = extract_keypoints_from_frame(frame)
|
| 389 |
keypoints_sequence.append(keypoints)
|
|
|
|
| 390 |
|
| 391 |
# 計算每一幀的光流特徵
|
| 392 |
flow_features = []
|
| 393 |
for i in range(len(frames) - 1):
|
| 394 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
flow_features.append(flow)
|
| 396 |
|
| 397 |
# 確保光流特徵的幀數與關鍵點一致
|
|
@@ -401,7 +456,7 @@ def predict_sign_language(video_path):
|
|
| 401 |
if flow_features:
|
| 402 |
flow_features.append(flow_features[-1])
|
| 403 |
else:
|
| 404 |
-
flow_features.append(np.zeros(10))
|
| 405 |
|
| 406 |
# 確保序列長度為50 (與訓練時一致)
|
| 407 |
target_length = 50
|
|
@@ -415,10 +470,10 @@ def predict_sign_language(video_path):
|
|
| 415 |
while len(keypoints_sequence) < target_length:
|
| 416 |
if keypoints_sequence:
|
| 417 |
keypoints_sequence.append(keypoints_sequence[-1])
|
| 418 |
-
flow_features.append(flow_features[-1] if flow_features else np.zeros(10))
|
| 419 |
else:
|
| 420 |
-
keypoints_sequence.append(np.zeros(225))
|
| 421 |
-
flow_features.append(np.zeros(10))
|
| 422 |
|
| 423 |
# 轉換為numpy數組再轉為tensor (避免警告)
|
| 424 |
keypoints_array = np.array(keypoints_sequence, dtype=np.float32)
|
|
|
|
| 268 |
raise FileNotFoundError(f"模型檔案不存在: {model_path}")
|
| 269 |
|
| 270 |
def extract_keypoints_from_frame(frame):
    """Extract a flat keypoint vector from a single frame with MediaPipe Holistic.

    The vector is laid out as left hand, right hand, then pose, each landmark
    contributing ``x, y, z``. A part that MediaPipe did not detect is
    zero-filled (21 * 3 values per hand, 33 * 3 values for the pose), and the
    concatenated list is truncated to at most 225 values.

    Args:
        frame: Image passed to ``cv2.cvtColor`` with ``cv2.COLOR_BGR2RGB``
            (presumably a BGR frame read by OpenCV -- confirm against caller).

    Returns:
        A ``(keypoints, results)`` tuple: ``keypoints`` is a ``np.float32``
        array and ``results`` is the object returned by ``holistic.process``.
        If anything raises, the error is printed and
        ``(np.zeros(225, dtype=np.float32), None)`` is returned instead.
    """
    def _flatten(landmark_list, point_count):
        # Keep the feature layout fixed by zero-filling undetected parts.
        if not landmark_list:
            return [0] * (point_count * 3)
        coords = []
        for point in landmark_list.landmark:
            coords += [point.x, point.y, point.z]
        return coords

    try:
        holistic_options = dict(
            static_image_mode=True,
            model_complexity=1,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        )
        with mp.solutions.holistic.Holistic(**holistic_options) as holistic:
            # Convert to RGB before handing the image to MediaPipe.
            rgb_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            rgb_image.flags.writeable = False
            results = holistic.process(rgb_image)
            rgb_image.flags.writeable = True

            # Order matters: left hand (63) + right hand (63) + pose (99).
            feature_values = (
                _flatten(results.left_hand_landmarks, 21)
                + _flatten(results.right_hand_landmarks, 21)
                + _flatten(results.pose_landmarks, 33)
            )

            return np.array(feature_values[:225], dtype=np.float32), results

    except Exception as e:
        print(f"關鍵點提取錯誤: {e}")
        return np.zeros(225, dtype=np.float32), None
|
| 313 |
+
|
| 314 |
+
def create_hand_mask(frame, left_hand_landmarks, right_hand_landmarks, pose_landmarks):
    """Build a binary ROI mask around the hands and the leading pose landmarks.

    A filled circle is drawn at every landmark whose pixel position falls
    inside the frame: radius 20 for each hand landmark and radius 10 for pose
    landmarks with index 0-24 (the range the original author treats as the
    upper body). The mask is then dilated once with a 15x15 kernel.

    Args:
        frame: Image whose first two ``shape`` entries give the mask height
            and width.
        left_hand_landmarks: Landmark container with a ``.landmark`` sequence,
            or a falsy value when the left hand was not detected.
        right_hand_landmarks: Same as above for the right hand.
        pose_landmarks: Same as above for the body pose.

    Returns:
        The dilated ``np.uint8`` mask of shape ``(height, width)``; marked
        pixels are 255, everything else is 0.
    """
    height, width = frame.shape[:2]
    roi = np.zeros((height, width), dtype=np.uint8)

    def _stamp(point, radius):
        # Landmark coordinates are scaled by the frame size to get pixels;
        # points that land outside the frame are skipped entirely.
        px, py = int(point.x * width), int(point.y * height)
        if 0 <= px < width and 0 <= py < height:
            cv2.circle(roi, (px, py), radius=radius, color=255, thickness=-1)

    # Hands first (left, then right), using the larger radius.
    for hand in (left_hand_landmarks, right_hand_landmarks):
        if hand:
            for point in hand.landmark:
                _stamp(point, 20)

    # Only pose landmarks with index 0-24 are drawn, bounded by how many exist.
    if pose_landmarks:
        pose_points = pose_landmarks.landmark
        for idx in range(min(25, len(pose_points))):
            _stamp(pose_points[idx], 10)

    # Grow the marked regions with a single dilation pass.
    structuring_element = np.ones((15, 15), np.uint8)
    return cv2.dilate(roi, structuring_element, iterations=1)
|
| 347 |
|
| 348 |
+
def compute_regional_optical_flow(prev_frame, curr_frame, mask, downscale=0.5):
    """Compute 10 optical-flow statistics restricted to a masked region.

    Both frames (and the mask) are optionally downscaled, converted to
    grayscale, and fed to ``cv2.calcOpticalFlowFarneback``. Statistics are
    taken only over pixels where the (resized) mask is greater than zero.

    Args:
        prev_frame: Earlier frame; converted with ``cv2.COLOR_BGR2GRAY``.
        curr_frame: Later frame; converted with ``cv2.COLOR_BGR2GRAY``.
        mask: ROI mask; pixels ``> 0`` select where flow is measured.
        downscale: Resize factor applied to frames and mask when it is
            below 1.0; otherwise the inputs are used unchanged.

    Returns:
        A ``np.float16`` array of length 10, in this order: mean(fx), std(fx),
        mean(fy), std(fy), 25th/75th percentile of fx, 25th/75th percentile
        of fy, max(|fx|), max(|fy|). All zeros when the mask selects no
        pixels or when any step raises (the error is printed).
    """
    try:
        # Work at reduced resolution when requested.
        if downscale < 1.0:
            h, w = prev_frame.shape[:2]
            new_h, new_w = int(h * downscale), int(w * downscale)
            target_size = (new_w, new_h)  # cv2.resize expects (width, height)
            prev_small, curr_small, mask_small = [
                cv2.resize(image, target_size)
                for image in (prev_frame, curr_frame, mask)
            ]
        else:
            prev_small, curr_small, mask_small = prev_frame, curr_frame, mask

        prev_gray = cv2.cvtColor(prev_small, cv2.COLOR_BGR2GRAY)
        curr_gray = cv2.cvtColor(curr_small, cv2.COLOR_BGR2GRAY)

        # Positional arguments after the two images:
        # flow, pyr_scale, levels, winsize, iterations, poly_n, poly_sigma, flags
        flow = cv2.calcOpticalFlowFarneback(
            prev_gray, curr_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0
        )

        region = mask_small > 0
        if not np.any(region):
            # Nothing selected by the mask: report an all-zero feature vector.
            return np.zeros(10, dtype=np.float16)

        # Horizontal and vertical flow components inside the region.
        fx = flow[..., 0][region]
        fy = flow[..., 1][region]

        stats = [
            np.mean(fx), np.std(fx),
            np.mean(fy), np.std(fy),
            np.percentile(fx, 25), np.percentile(fx, 75),
            np.percentile(fy, 25), np.percentile(fy, 75),
            np.max(np.abs(fx)), np.max(np.abs(fy)),
        ]
        return np.array(stats, dtype=np.float16)

    except Exception as e:
        print(f"區域性光流計算錯誤: {e}")
        return np.zeros(10, dtype=np.float16)
|
| 405 |
|
| 406 |
def predict_sign_language(video_path):
|
| 407 |
"""預測手語影片"""
|
|
|
|
| 420 |
if len(frames) == 0:
|
| 421 |
return "錯誤:無法讀取影片幀", 0.0
|
| 422 |
|
| 423 |
+
# 提取特徵 - 同時獲取關鍵點和MediaPipe結果
|
| 424 |
keypoints_sequence = []
|
| 425 |
+
all_results = []
|
| 426 |
+
|
| 427 |
for frame in frames:
|
| 428 |
+
keypoints, results = extract_keypoints_from_frame(frame)
|
| 429 |
keypoints_sequence.append(keypoints)
|
| 430 |
+
all_results.append(results)
|
| 431 |
|
| 432 |
# 計算每一幀的光流特徵
|
| 433 |
flow_features = []
|
| 434 |
for i in range(len(frames) - 1):
|
| 435 |
+
# 使用當前幀的MediaPipe結果創建遮罩
|
| 436 |
+
current_results = all_results[i]
|
| 437 |
+
if current_results is not None:
|
| 438 |
+
mask = create_hand_mask(
|
| 439 |
+
frames[i],
|
| 440 |
+
current_results.left_hand_landmarks,
|
| 441 |
+
current_results.right_hand_landmarks,
|
| 442 |
+
current_results.pose_landmarks
|
| 443 |
+
)
|
| 444 |
+
else:
|
| 445 |
+
# 如果沒有MediaPipe結果,創建空遮罩
|
| 446 |
+
h, w = frames[i].shape[:2]
|
| 447 |
+
mask = np.zeros((h, w), dtype=np.uint8)
|
| 448 |
+
|
| 449 |
+
flow = compute_regional_optical_flow(frames[i], frames[i + 1], mask)
|
| 450 |
flow_features.append(flow)
|
| 451 |
|
| 452 |
# 確保光流特徵的幀數與關鍵點一致
|
|
|
|
| 456 |
if flow_features:
|
| 457 |
flow_features.append(flow_features[-1])
|
| 458 |
else:
|
| 459 |
+
flow_features.append(np.zeros(10, dtype=np.float16))
|
| 460 |
|
| 461 |
# 確保序列長度為50 (與訓練時一致)
|
| 462 |
target_length = 50
|
|
|
|
| 470 |
while len(keypoints_sequence) < target_length:
|
| 471 |
if keypoints_sequence:
|
| 472 |
keypoints_sequence.append(keypoints_sequence[-1])
|
| 473 |
+
flow_features.append(flow_features[-1] if flow_features else np.zeros(10, dtype=np.float16))
|
| 474 |
else:
|
| 475 |
+
keypoints_sequence.append(np.zeros(225, dtype=np.float32))
|
| 476 |
+
flow_features.append(np.zeros(10, dtype=np.float16))
|
| 477 |
|
| 478 |
# 轉換為numpy數組再轉為tensor (避免警告)
|
| 479 |
keypoints_array = np.array(keypoints_sequence, dtype=np.float32)
|