Commit 3abf194 · Parent: 3bf3b96
🔧 Fix model architecture mismatch + pin pydantic version

- Use the correct SignLanguageModel architecture (including keypoint_projection, flow_projection, etc.)
- Fix the keypoint extraction dimensions (225 dims: 99 pose + 126 hand)
- Simplify the optical-flow feature computation (10 dims)
- Add pydantic==2.10.6 to resolve a schema error
- Set the sequence length to 50 (matching training)
- app.py +261 -66
- requirements.txt +2 -1
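As a quick check of the dimension arithmetic in the bullets above (a minimal sketch; the landmark counts are MediaPipe Pose's 33 points and Hands' 21 points per hand, each contributing x, y, z):

    pose_dims = 33 * 3       # 33 pose landmarks, (x, y, z) each = 99
    hand_dims = 2 * 21 * 3   # 2 hands * 21 landmarks, (x, y, z) each = 126
    assert pose_dims + hand_dims == 225  # the input_dim the model expects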
app.py
CHANGED
@@ -3,10 +3,11 @@ import cv2
 import numpy as np
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import gradio as gr
 from pathlib import Path
 import mediapipe as mp
-import …
+import json
 
 # MediaPipe setup
 mp_pose = mp.solutions.pose
@@ -21,46 +22,231 @@ print(f"Using device: {device}")
 label_to_idx = {'again': 0, 'all': 1, 'apple': 2, 'bad': 3, 'bathroom': 4, 'beautiful': 5, 'bird': 6, 'black': 7, 'blue': 8, 'book': 9, 'bored': 10, 'boy': 11, 'brother': 12, 'brown': 13, 'but': 14, 'computer': 15, 'cousin': 16, 'dance': 17, 'day': 18, 'deaf': 19, 'doctor': 20, 'dog': 21, 'draw': 22, 'drink': 23, 'eat': 24, 'english': 25, 'family': 26, 'father': 27, 'fine': 28, 'finish': 29, 'fish': 30, 'forget': 31, 'friend': 32, 'girl': 33}
 idx_to_label = {v: k for k, v in label_to_idx.items()}
 
-class …
-        self.…
+class SignLanguageModel(nn.Module):
+    """Sign Language Recognition Model"""
+    def __init__(self, input_dim, hidden_dim, num_layers, num_classes, dropout=0.5, flow_dim=10):
+        super(SignLanguageModel, self).__init__()
+        self.hidden_dim = hidden_dim
         self.num_layers = num_layers
+        self.num_classes = num_classes
+
+        # Keypoint feature projection
+        self.keypoint_projection = nn.Sequential(
+            nn.Linear(input_dim, hidden_dim),
+            nn.BatchNorm1d(hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(dropout/2),
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.BatchNorm1d(hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(dropout/2)
+        )
+
+        # Flow feature projection
+        self.flow_projection = nn.Sequential(
+            nn.Linear(flow_dim, hidden_dim // 2),
+            nn.BatchNorm1d(hidden_dim // 2),
+            nn.ReLU(),
+            nn.Dropout(dropout/2),
+            nn.Linear(hidden_dim // 2, hidden_dim // 2),
+            nn.BatchNorm1d(hidden_dim // 2),
+            nn.ReLU(),
+            nn.Dropout(dropout/2)
+        )
+
+        # Feature fusion
+        self.fusion_layer = nn.Sequential(
+            nn.Linear(hidden_dim + (hidden_dim // 2), hidden_dim),
+            nn.BatchNorm1d(hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(dropout/2)
+        )
+
+        # Bidirectional LSTM
+        self.lstm = nn.LSTM(
+            input_size=hidden_dim,
+            hidden_size=hidden_dim,
+            num_layers=num_layers,
+            batch_first=True,
+            dropout=dropout if num_layers > 1 else 0,
+            bidirectional=True
+        )
+
+        # GRU for additional temporal features
+        self.gru = nn.GRU(
+            input_size=hidden_dim * 2,
+            hidden_size=hidden_dim,
+            num_layers=1,
+            batch_first=True,
+            bidirectional=True
+        )
+
+        # Batch normalization
+        self.lstm_bn = nn.BatchNorm1d(hidden_dim * 2)
+        self.gru_bn = nn.BatchNorm1d(hidden_dim * 2)
+
+        # Multi-head attention
+        self.multihead_attn = nn.MultiheadAttention(
+            embed_dim=hidden_dim * 2,
+            num_heads=4,
+            dropout=dropout,
+            batch_first=True
+        )
+
+        # Attention mechanism
+        self.attention = nn.Sequential(
+            nn.Linear(hidden_dim * 2, hidden_dim),
+            nn.Tanh(),
+            nn.Linear(hidden_dim, 1),
+            nn.Softmax(dim=1)
+        )
+
+        # Classifier
+        self.classifier = nn.Sequential(
+            nn.Linear(hidden_dim * 4, hidden_dim * 2),
+            nn.BatchNorm1d(hidden_dim * 2),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim * 2, hidden_dim),
+            nn.BatchNorm1d(hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(dropout/2),
+            nn.Linear(hidden_dim, num_classes)
+        )
+
+        self._init_weights()
+
+    def _init_weights(self):
+        """Initialize model weights"""
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, (nn.LSTM, nn.GRU)):
+                for name, param in m.named_parameters():
+                    if 'weight' in name:
+                        nn.init.orthogonal_(param)
+                    elif 'bias' in name:
+                        nn.init.zeros_(param)
+
+    def forward(self, keypoints, flow=None):
+        """Forward pass"""
+        batch_size, seq_len, _ = keypoints.size()
+
+        # Process keypoint features
+        kp_reshaped = keypoints.reshape(-1, keypoints.size(-1))
+
+        # First layer
+        kp_projected = self.keypoint_projection[0](kp_reshaped)
+        kp_projected = kp_projected.reshape(batch_size, seq_len, -1)
+        kp_projected = kp_projected.transpose(1, 2)
+        kp_projected = self.keypoint_projection[1](kp_projected)
+        kp_projected = kp_projected.transpose(1, 2)
+        kp_projected = self.keypoint_projection[2](kp_projected)
+        kp_projected = self.keypoint_projection[3](kp_projected)
+
+        # Second layer
+        kp_projected_reshaped = kp_projected.reshape(-1, kp_projected.size(-1))
+        kp_projected = self.keypoint_projection[4](kp_projected_reshaped)
+        kp_projected = kp_projected.reshape(batch_size, seq_len, -1)
+        kp_projected = kp_projected.transpose(1, 2)
+        kp_projected = self.keypoint_projection[5](kp_projected)
+        kp_projected = kp_projected.transpose(1, 2)
+        kp_projected = self.keypoint_projection[6](kp_projected)
+        kp_projected = self.keypoint_projection[7](kp_projected)
+
+        # Process flow features if provided
+        if flow is not None:
+            flow_reshaped = flow.reshape(-1, flow.size(-1))
+
+            # First layer
+            flow_projected = self.flow_projection[0](flow_reshaped)
+            flow_projected = flow_projected.reshape(batch_size, seq_len, -1)
+            flow_projected = flow_projected.transpose(1, 2)
+            flow_projected = self.flow_projection[1](flow_projected)
+            flow_projected = flow_projected.transpose(1, 2)
+            flow_projected = self.flow_projection[2](flow_projected)
+            flow_projected = self.flow_projection[3](flow_projected)
+
+            # Second layer
+            flow_projected_reshaped = flow_projected.reshape(-1, flow_projected.size(-1))
+            flow_projected = self.flow_projection[4](flow_projected_reshaped)
+            flow_projected = flow_projected.reshape(batch_size, seq_len, -1)
+            flow_projected = flow_projected.transpose(1, 2)
+            flow_projected = self.flow_projection[5](flow_projected)
+            flow_projected = flow_projected.transpose(1, 2)
+            flow_projected = self.flow_projection[6](flow_projected)
+            flow_projected = self.flow_projection[7](flow_projected)
+
+            # Feature fusion
+            combined_features = torch.cat([kp_projected, flow_projected], dim=2)
+
+            combined_reshaped = combined_features.reshape(-1, combined_features.size(-1))
+            fused_features = self.fusion_layer[0](combined_reshaped)
+            fused_features = fused_features.reshape(batch_size, seq_len, -1)
+            fused_features = fused_features.transpose(1, 2)
+            fused_features = self.fusion_layer[1](fused_features)
+            fused_features = fused_features.transpose(1, 2)
+            fused_features = self.fusion_layer[2](fused_features)
+            fused_features = self.fusion_layer[3](fused_features)
+
+            x_projected = fused_features
+        else:
+            x_projected = kp_projected
+
+        # Residual connection
+        x_residual = x_projected
+
+        # LSTM processing
+        lstm_out, _ = self.lstm(x_projected)
+
+        # Residual connection
+        x_residual_expanded = torch.cat([x_residual, x_residual], dim=2)
+        lstm_out_with_residual = lstm_out + x_residual_expanded
+
+        # BatchNorm for LSTM output
+        lstm_out_bn = lstm_out_with_residual.transpose(1, 2)
+        lstm_out_bn = self.lstm_bn(lstm_out_bn)
+        lstm_out = lstm_out_bn.transpose(1, 2)
+
+        # GRU processing
+        gru_out, _ = self.gru(lstm_out)
+
+        # BatchNorm for GRU output
+        gru_out_bn = gru_out.transpose(1, 2)
+        gru_out_bn = self.gru_bn(gru_out_bn)
+        gru_out = gru_out_bn.transpose(1, 2)
+
+        # Multi-head attention
+        attn_output, _ = self.multihead_attn(lstm_out, lstm_out, lstm_out)
+
+        # Traditional attention
+        attention_weights = self.attention(gru_out)
+        context_gru = torch.bmm(gru_out.transpose(1, 2), attention_weights)
+        context_gru = context_gru.squeeze(-1)
+
+        attention_weights_attn = self.attention(attn_output)
+        context_attn = torch.bmm(attn_output.transpose(1, 2), attention_weights_attn)
+        context_attn = context_attn.squeeze(-1)
+
+        # Combine contexts
+        combined_context = torch.cat([context_gru, context_attn], dim=1)
+
+        # Final classification
+        output = self.classifier(combined_context)
 
         return output
 
 # Initialize the model
-…
+model = SignLanguageModel(
+    input_dim=225,  # keypoint dimension
+    hidden_dim=256,
+    num_layers=2,
+    num_classes=len(label_to_idx),
+    dropout=0.5,
+    flow_dim=10
+)
 model = model.to(device)
 
 # Load model weights
@@ -85,52 +271,60 @@ def extract_keypoints_from_frame(frame):
     """Extract keypoints from a single frame"""
     try:
         with mp_pose.Pose(static_image_mode=True, model_complexity=1) as pose, \
-             mp_hands.Hands(static_image_mode=True, max_num_hands=2) as hands:
+             mp_hands.Hands(static_image_mode=True, max_num_hands=2) as hands, \
+             mp_face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1) as face_mesh:
 
             rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
 
             keypoints = []
 
-            # Extract pose keypoints
+            # Extract pose keypoints (33 points * 3 dims = 99)
             pose_results = pose.process(rgb_frame)
             if pose_results.pose_landmarks:
                 pose_points = []
                 for landmark in pose_results.pose_landmarks.landmark:
-                    pose_points.extend([landmark.x, landmark.y])
+                    pose_points.extend([landmark.x, landmark.y, landmark.z])
                 keypoints.extend(pose_points)
             else:
-                keypoints.extend([0.0] * …)
+                keypoints.extend([0.0] * 99)
 
-            # Extract hand keypoints
+            # Extract hand keypoints (21 points * 2 hands * 3 dims = 126)
             hands_results = hands.process(rgb_frame)
             if hands_results.multi_hand_landmarks:
                 hand_points = []
                 for hand_landmarks in hands_results.multi_hand_landmarks:
                     for landmark in hand_landmarks.landmark:
-                        hand_points.extend([landmark.x, landmark.y])
-                …
+                        hand_points.extend([landmark.x, landmark.y, landmark.z])
+
+                # Ensure 126 hand keypoint values (2 hands)
+                if len(hand_points) >= 126:
+                    keypoints.extend(hand_points[:126])
                 else:
-                    keypoints.extend(hand_points + [0.0] * (…))
+                    keypoints.extend(hand_points + [0.0] * (126 - len(hand_points)))
             else:
-                keypoints.extend([0.0] * …)
+                keypoints.extend([0.0] * 126)
+
+            # Pad to 225 features in total if needed
+            while len(keypoints) < 225:
+                keypoints.append(0.0)
 
-        return np.array(keypoints, dtype=np.float32)
+            return np.array(keypoints[:225], dtype=np.float32)
     except Exception as e:
         print(f"Keypoint extraction error: {e}")
-        return np.zeros(…)
+        return np.zeros(225, dtype=np.float32)
 
 def calculate_optical_flow_features(frames):
     """Compute optical-flow features"""
     try:
         if len(frames) < 2:
-            return np.zeros(…)
+            return np.zeros(10, dtype=np.float32)
 
         flow_features = []
-        for i in range(len(frames) - 1):
+        for i in range(min(len(frames) - 1, 10)):  # compute at most 10 flow values
            gray1 = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY)
            gray2 = cv2.cvtColor(frames[i + 1], cv2.COLOR_BGR2GRAY)
 
+            # Compute optical flow
            flow = cv2.calcOpticalFlowPyrLK(
                gray1, gray2, None, None,
                winSize=(15, 15),
@@ -139,17 +333,20 @@ def calculate_optical_flow_features(frames):
            )
 
            if flow[0] is not None and len(flow[0]) > 0:
-                …
+                # Average flow magnitude
+                flow_magnitude = np.mean(np.sqrt(flow[0].flatten()**2))
+                flow_features.append(flow_magnitude)
            else:
-                flow_features.…
+                flow_features.append(0.0)
 
-        …
+        # Ensure exactly 10 flow features
+        while len(flow_features) < 10:
+            flow_features.append(0.0)
+
+        return np.array(flow_features[:10], dtype=np.float32)
     except Exception as e:
         print(f"Optical flow error: {e}")
-        return np.zeros(…)
+        return np.zeros(10, dtype=np.float32)
 
 def predict_sign_language(video_path):
     """Predict the sign in a video"""
@@ -176,31 +373,29 @@ def predict_sign_language(video_path):
 
     optical_flow = calculate_optical_flow_features(frames)
 
-    # Force the sequence length to …
-    target_length = …
+    # Force the sequence length to 50 (matching training)
+    target_length = 50
     if len(keypoints_sequence) > target_length:
-        …
+        # Uniform sampling
+        indices = np.linspace(0, len(keypoints_sequence)-1, target_length, dtype=int)
+        keypoints_sequence = [keypoints_sequence[i] for i in indices]
     elif len(keypoints_sequence) < target_length:
-        …
+        # Repeat the last frame
+        last_frame = keypoints_sequence[-1] if keypoints_sequence else np.zeros(225)
         while len(keypoints_sequence) < target_length:
             keypoints_sequence.append(last_frame)
 
-    # …
-    for i …
-        …
-            flow_feature = optical_flow[i*54:(i+1)*54]
-        else:
-            flow_feature = np.zeros(54)
-
-        combined_features = np.concatenate([keypoints, flow_feature, np.zeros(54)])
-        features_sequence.append(combined_features)
+    # Create flow features for every time step
+    flow_sequence = []
+    for i in range(target_length):
+        flow_sequence.append(optical_flow)
 
     # Convert to tensors and predict
-    …
+    keypoints_tensor = torch.tensor([keypoints_sequence], dtype=torch.float32).to(device)
+    flow_tensor = torch.tensor([flow_sequence], dtype=torch.float32).to(device)
 
     with torch.no_grad():
-        outputs = model(…)
+        outputs = model(keypoints_tensor, flow_tensor)
         probabilities = torch.softmax(outputs, dim=1)
         predicted_class = torch.argmax(probabilities, dim=1).item()
         confidence = probabilities[0][predicted_class].item()
@@ -240,7 +435,7 @@ demo = gr.Interface(
     **System highlights:**
     - 🎯 Accuracy: 94.25%
    - 📚 Supports 34 sign language words
-    - 🧠 Uses BiLSTM + …
+    - 🧠 Uses BiLSTM + GRU + multi-head attention
    - 👁️ MediaPipe + optical-flow feature fusion
 
    **How to use:**
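A minimal smoke test of the rebuilt model, assuming the class definition and hyperparameters from the diff above (the dummy tensors and the eval() call are illustrative; eval mode sidesteps BatchNorm's restriction on a training-mode batch of one):

    import torch

    model = SignLanguageModel(input_dim=225, hidden_dim=256, num_layers=2,
                              num_classes=34, dropout=0.5, flow_dim=10)
    model.eval()  # use running stats in the BatchNorm layers

    keypoints = torch.randn(1, 50, 225)  # (batch, target_length, keypoint dims)
    flow = torch.randn(1, 50, 10)        # (batch, target_length, flow dims)

    with torch.no_grad():
        logits = model(keypoints, flow)
    print(logits.shape)  # expected: torch.Size([1, 34])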
requirements.txt
CHANGED
@@ -5,4 +5,5 @@ opencv-python>=4.8.0
 mediapipe>=0.10.0
 numpy>=1.24.0
 Pillow>=9.5.0
-scipy>=1.10.0
+scipy>=1.10.0
+pydantic==2.10.6