XiaoBai1221 committed
Commit 3abf194 · 1 Parent(s): 3bf3b96

🔧 Fix model architecture mismatch + pin pydantic version


- Use the correct SignLanguageModel architecture (including keypoint_projection, flow_projection, etc.)
- Fix the keypoint extraction dimensions (225 dims: pose 99 + hands 126)
- Simplify the optical-flow feature computation (10 dims)
- Add pydantic==2.10.6 to fix the schema error
- Set the sequence length to 50 (consistent with training); see the input-shape sketch below
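
The dimensions above translate directly into the model's expected inputs. A minimal, illustrative shape sketch (the dummy tensors and this smoke test are not part of app.py; SignLanguageModel and its constructor arguments come from the diff below):

import torch

# Shapes implied by this commit (illustrative only):
# 50 frames per clip, 225 keypoint features per frame
# (pose 33*3 = 99 + hands 2*21*3 = 126), plus a 10-dim optical-flow
# vector repeated for every time step.
batch_size, seq_len = 1, 50
keypoints = torch.randn(batch_size, seq_len, 225)  # (B, T, 225)
flow = torch.randn(batch_size, seq_len, 10)        # (B, T, 10)

# With the SignLanguageModel defined in app.py this would be called roughly as:
#   model = SignLanguageModel(input_dim=225, hidden_dim=256, num_layers=2,
#                             num_classes=34, dropout=0.5, flow_dim=10)
#   logits = model(keypoints, flow)  # expected shape: (B, 34)
print(keypoints.shape, flow.shape)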

Files changed (2)
  1. app.py +261 -66
  2. requirements.txt +2 -1
app.py CHANGED
@@ -3,10 +3,11 @@ import cv2
  import numpy as np
  import torch
  import torch.nn as nn
+ import torch.nn.functional as F
  import gradio as gr
  from pathlib import Path
  import mediapipe as mp
- import pickle
+ import json
 
  # MediaPipe setup
  mp_pose = mp.solutions.pose
@@ -21,46 +22,231 @@ print(f"Using device: {device}")
  label_to_idx = {'again': 0, 'all': 1, 'apple': 2, 'bad': 3, 'bathroom': 4, 'beautiful': 5, 'bird': 6, 'black': 7, 'blue': 8, 'book': 9, 'bored': 10, 'boy': 11, 'brother': 12, 'brown': 13, 'but': 14, 'computer': 15, 'cousin': 16, 'dance': 17, 'day': 18, 'deaf': 19, 'doctor': 20, 'dog': 21, 'draw': 22, 'drink': 23, 'eat': 24, 'english': 25, 'family': 26, 'father': 27, 'fine': 28, 'finish': 29, 'fish': 30, 'forget': 31, 'friend': 32, 'girl': 33}
  idx_to_label = {v: k for k, v in label_to_idx.items()}
 
- class BiLSTMWithAttention(nn.Module):
-     def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.5):
-         super(BiLSTMWithAttention, self).__init__()
-         self.hidden_size = hidden_size
          self.num_layers = num_layers
+ class SignLanguageModel(nn.Module):
+     """Sign Language Recognition Model"""
+     def __init__(self, input_dim, hidden_dim, num_layers, num_classes, dropout=0.5, flow_dim=10):
+         super(SignLanguageModel, self).__init__()
+         self.hidden_dim = hidden_dim
+         self.num_classes = num_classes
 
-         self.bilstm = nn.LSTM(input_size, hidden_size, num_layers,
-                               batch_first=True, bidirectional=True, dropout=dropout)
+         # Keypoint feature projection
+         self.keypoint_projection = nn.Sequential(
+             nn.Linear(input_dim, hidden_dim),
+             nn.BatchNorm1d(hidden_dim),
+             nn.ReLU(),
+             nn.Dropout(dropout/2),
+             nn.Linear(hidden_dim, hidden_dim),
+             nn.BatchNorm1d(hidden_dim),
+             nn.ReLU(),
+             nn.Dropout(dropout/2)
+         )
 
-         # Attention mechanism
-         self.attention = nn.Linear(hidden_size * 2, 1)
+         # Flow feature projection
+         self.flow_projection = nn.Sequential(
+             nn.Linear(flow_dim, hidden_dim // 2),
+             nn.BatchNorm1d(hidden_dim // 2),
+             nn.ReLU(),
+             nn.Dropout(dropout/2),
+             nn.Linear(hidden_dim // 2, hidden_dim // 2),
+             nn.BatchNorm1d(hidden_dim // 2),
+             nn.ReLU(),
+             nn.Dropout(dropout/2)
+         )
 
-         # Classification layer
-         self.classifier = nn.Linear(hidden_size * 2, num_classes)
-         self.dropout = nn.Dropout(dropout)
+         # Feature fusion
+         self.fusion_layer = nn.Sequential(
+             nn.Linear(hidden_dim + (hidden_dim // 2), hidden_dim),
+             nn.BatchNorm1d(hidden_dim),
+             nn.ReLU(),
+             nn.Dropout(dropout/2)
+         )
 
-     def forward(self, x):
-         batch_size = x.size(0)
+         # Bidirectional LSTM
+         self.lstm = nn.LSTM(
+             input_size=hidden_dim,
+             hidden_size=hidden_dim,
+             num_layers=num_layers,
+             batch_first=True,
+             dropout=dropout if num_layers > 1 else 0,
+             bidirectional=True
+         )
 
-         # LSTM forward pass
-         lstm_out, _ = self.bilstm(x)
+         # GRU for additional temporal features
+         self.gru = nn.GRU(
+             input_size=hidden_dim * 2,
+             hidden_size=hidden_dim,
+             num_layers=1,
+             batch_first=True,
+             bidirectional=True
+         )
 
-         # Compute attention weights
-         attention_weights = torch.softmax(self.attention(lstm_out), dim=1)
+         # Batch normalization
+         self.lstm_bn = nn.BatchNorm1d(hidden_dim * 2)
+         self.gru_bn = nn.BatchNorm1d(hidden_dim * 2)
 
-         # Weighted average
-         context_vector = torch.sum(attention_weights * lstm_out, dim=1)
+         # Multi-head attention
+         self.multihead_attn = nn.MultiheadAttention(
+             embed_dim=hidden_dim * 2,
+             num_heads=4,
+             dropout=dropout,
+             batch_first=True
+         )
 
-         # Classification
-         output = self.classifier(self.dropout(context_vector))
+         # Attention mechanism
+         self.attention = nn.Sequential(
+             nn.Linear(hidden_dim * 2, hidden_dim),
+             nn.Tanh(),
+             nn.Linear(hidden_dim, 1),
+             nn.Softmax(dim=1)
+         )
+
+         # Classifier
+         self.classifier = nn.Sequential(
+             nn.Linear(hidden_dim * 4, hidden_dim * 2),
+             nn.BatchNorm1d(hidden_dim * 2),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+             nn.Linear(hidden_dim * 2, hidden_dim),
+             nn.BatchNorm1d(hidden_dim),
+             nn.ReLU(),
+             nn.Dropout(dropout/2),
+             nn.Linear(hidden_dim, num_classes)
+         )
+
+         self._init_weights()
+
+     def _init_weights(self):
+         """Initialize model weights"""
+         for m in self.modules():
+             if isinstance(m, nn.Linear):
+                 nn.init.xavier_uniform_(m.weight)
+                 if m.bias is not None:
+                     nn.init.zeros_(m.bias)
+             elif isinstance(m, (nn.LSTM, nn.GRU)):
+                 for name, param in m.named_parameters():
+                     if 'weight' in name:
+                         nn.init.orthogonal_(param)
+                     elif 'bias' in name:
+                         nn.init.zeros_(param)
+
+     def forward(self, keypoints, flow=None):
+         """Forward pass"""
+         batch_size, seq_len, _ = keypoints.size()
+
+         # Process keypoint features
+         kp_reshaped = keypoints.reshape(-1, keypoints.size(-1))
+
+         # First layer
+         kp_projected = self.keypoint_projection[0](kp_reshaped)
+         kp_projected = kp_projected.reshape(batch_size, seq_len, -1)
+         kp_projected = kp_projected.transpose(1, 2)
+         kp_projected = self.keypoint_projection[1](kp_projected)
+         kp_projected = kp_projected.transpose(1, 2)
+         kp_projected = self.keypoint_projection[2](kp_projected)
+         kp_projected = self.keypoint_projection[3](kp_projected)
+
+         # Second layer
+         kp_projected_reshaped = kp_projected.reshape(-1, kp_projected.size(-1))
+         kp_projected = self.keypoint_projection[4](kp_projected_reshaped)
+         kp_projected = kp_projected.reshape(batch_size, seq_len, -1)
+         kp_projected = kp_projected.transpose(1, 2)
+         kp_projected = self.keypoint_projection[5](kp_projected)
+         kp_projected = kp_projected.transpose(1, 2)
+         kp_projected = self.keypoint_projection[6](kp_projected)
+         kp_projected = self.keypoint_projection[7](kp_projected)
+
+         # Process flow features if provided
+         if flow is not None:
+             flow_reshaped = flow.reshape(-1, flow.size(-1))
+
+             # First layer
+             flow_projected = self.flow_projection[0](flow_reshaped)
+             flow_projected = flow_projected.reshape(batch_size, seq_len, -1)
+             flow_projected = flow_projected.transpose(1, 2)
+             flow_projected = self.flow_projection[1](flow_projected)
+             flow_projected = flow_projected.transpose(1, 2)
+             flow_projected = self.flow_projection[2](flow_projected)
+             flow_projected = self.flow_projection[3](flow_projected)
+
+             # Second layer
+             flow_projected_reshaped = flow_projected.reshape(-1, flow_projected.size(-1))
+             flow_projected = self.flow_projection[4](flow_projected_reshaped)
+             flow_projected = flow_projected.reshape(batch_size, seq_len, -1)
+             flow_projected = flow_projected.transpose(1, 2)
+             flow_projected = self.flow_projection[5](flow_projected)
+             flow_projected = flow_projected.transpose(1, 2)
+             flow_projected = self.flow_projection[6](flow_projected)
+             flow_projected = self.flow_projection[7](flow_projected)
+
+             # Feature fusion
+             combined_features = torch.cat([kp_projected, flow_projected], dim=2)
+
+             combined_reshaped = combined_features.reshape(-1, combined_features.size(-1))
+             fused_features = self.fusion_layer[0](combined_reshaped)
+             fused_features = fused_features.reshape(batch_size, seq_len, -1)
+             fused_features = fused_features.transpose(1, 2)
+             fused_features = self.fusion_layer[1](fused_features)
+             fused_features = fused_features.transpose(1, 2)
+             fused_features = self.fusion_layer[2](fused_features)
+             fused_features = self.fusion_layer[3](fused_features)
+
+             x_projected = fused_features
+         else:
+             x_projected = kp_projected
+
+         # Residual connection
+         x_residual = x_projected
+
+         # LSTM processing
+         lstm_out, _ = self.lstm(x_projected)
+
+         # Residual connection
+         x_residual_expanded = torch.cat([x_residual, x_residual], dim=2)
+         lstm_out_with_residual = lstm_out + x_residual_expanded
+
+         # BatchNorm for LSTM output
+         lstm_out_bn = lstm_out_with_residual.transpose(1, 2)
+         lstm_out_bn = self.lstm_bn(lstm_out_bn)
+         lstm_out = lstm_out_bn.transpose(1, 2)
+
+         # GRU processing
+         gru_out, _ = self.gru(lstm_out)
+
+         # BatchNorm for GRU output
+         gru_out_bn = gru_out.transpose(1, 2)
+         gru_out_bn = self.gru_bn(gru_out_bn)
+         gru_out = gru_out_bn.transpose(1, 2)
+
+         # Multi-head attention
+         attn_output, _ = self.multihead_attn(lstm_out, lstm_out, lstm_out)
+
+         # Traditional attention
+         attention_weights = self.attention(gru_out)
+         context_gru = torch.bmm(gru_out.transpose(1, 2), attention_weights)
+         context_gru = context_gru.squeeze(-1)
+
+         attention_weights_attn = self.attention(attn_output)
+         context_attn = torch.bmm(attn_output.transpose(1, 2), attention_weights_attn)
+         context_attn = context_attn.squeeze(-1)
+
+         # Combine contexts
+         combined_context = torch.cat([context_gru, context_attn], dim=1)
+
+         # Final classification
+         output = self.classifier(combined_context)
 
          return output
 
  # Initialize the model
- input_size = 258  # keypoints (75*2) + optical_flow (108)
- hidden_size = 256
- num_layers = 3
- num_classes = len(label_to_idx)
-
- model = BiLSTMWithAttention(input_size, hidden_size, num_layers, num_classes)
+ model = SignLanguageModel(
+     input_dim=225,  # keypoint dimension
+     hidden_dim=256,
+     num_layers=2,
+     num_classes=len(label_to_idx),
+     dropout=0.5,
+     flow_dim=10
+ )
  model = model.to(device)
 
  # Load model weights
@@ -85,52 +271,60 @@ def extract_keypoints_from_frame(frame):
      """Extract keypoints from a single frame"""
      try:
          with mp_pose.Pose(static_image_mode=True, model_complexity=1) as pose, \
-              mp_hands.Hands(static_image_mode=True, max_num_hands=2) as hands:
+              mp_hands.Hands(static_image_mode=True, max_num_hands=2) as hands, \
+              mp_face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1) as face_mesh:
 
              rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
 
              keypoints = []
 
-             # Extract pose keypoints
+             # Extract pose keypoints (33 points * 3 dims = 99)
              pose_results = pose.process(rgb_frame)
              if pose_results.pose_landmarks:
                  pose_points = []
                  for landmark in pose_results.pose_landmarks.landmark:
-                     pose_points.extend([landmark.x, landmark.y])
+                     pose_points.extend([landmark.x, landmark.y, landmark.z])
                  keypoints.extend(pose_points)
              else:
-                 keypoints.extend([0.0] * 66)  # 33 pose points * 2
+                 keypoints.extend([0.0] * 99)
 
-             # Extract hand keypoints
+             # Extract hand keypoints (21 points * 2 hands * 3 dims = 126)
              hands_results = hands.process(rgb_frame)
              if hands_results.multi_hand_landmarks:
                  hand_points = []
                  for hand_landmarks in hands_results.multi_hand_landmarks:
                      for landmark in hand_landmarks.landmark:
-                         hand_points.extend([landmark.x, landmark.y])
-                 if len(hand_points) >= 42:  # at least one hand
-                     keypoints.extend(hand_points[:42])
+                         hand_points.extend([landmark.x, landmark.y, landmark.z])
+
+                 # Make sure there are 126 hand keypoints (2 hands)
+                 if len(hand_points) >= 126:
+                     keypoints.extend(hand_points[:126])
                  else:
-                     keypoints.extend(hand_points + [0.0] * (42 - len(hand_points)))
+                     keypoints.extend(hand_points + [0.0] * (126 - len(hand_points)))
              else:
-                 keypoints.extend([0.0] * 42)  # 21 hand points * 2
+                 keypoints.extend([0.0] * 126)
+
+             # If needed, pad to 225 features in total
+             while len(keypoints) < 225:
+                 keypoints.append(0.0)
 
-             return np.array(keypoints, dtype=np.float32)
+             return np.array(keypoints[:225], dtype=np.float32)
      except Exception as e:
          print(f"Keypoint extraction error: {e}")
-         return np.zeros(150, dtype=np.float32)
+         return np.zeros(225, dtype=np.float32)
 
  def calculate_optical_flow_features(frames):
      """Compute optical flow features"""
      try:
          if len(frames) < 2:
-             return np.zeros(108, dtype=np.float32)
+             return np.zeros(10, dtype=np.float32)
 
          flow_features = []
-         for i in range(len(frames) - 1):
+         for i in range(min(len(frames) - 1, 10)):  # compute at most 10 optical flows
             gray1 = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY)
             gray2 = cv2.cvtColor(frames[i + 1], cv2.COLOR_BGR2GRAY)
 
+             # Compute optical flow
             flow = cv2.calcOpticalFlowPyrLK(
                 gray1, gray2, None, None,
                 winSize=(15, 15),
@@ -139,17 +333,20 @@ def calculate_optical_flow_features(frames):
             )
 
             if flow[0] is not None and len(flow[0]) > 0:
-                 flow_features.extend(flow[0].flatten()[:54])
+                 # Average optical-flow magnitude
+                 flow_magnitude = np.mean(np.sqrt(flow[0].flatten()**2))
+                 flow_features.append(flow_magnitude)
             else:
-                 flow_features.extend([0.0] * 54)
+                 flow_features.append(0.0)
 
-         if len(flow_features) >= 108:
-             return np.array(flow_features[:108], dtype=np.float32)
-         else:
-             return np.array(flow_features + [0.0] * (108 - len(flow_features)), dtype=np.float32)
+         # Make sure there are 10 flow features
+         while len(flow_features) < 10:
+             flow_features.append(0.0)
+
+         return np.array(flow_features[:10], dtype=np.float32)
      except Exception as e:
          print(f"Optical flow computation error: {e}")
-         return np.zeros(108, dtype=np.float32)
+         return np.zeros(10, dtype=np.float32)
 
  def predict_sign_language(video_path):
      """Predict sign language from a video"""
@@ -176,31 +373,29 @@ def predict_sign_language(video_path):
 
     optical_flow = calculate_optical_flow_features(frames)
 
-     # Make sure the sequence length is 104
-     target_length = 104
+     # Make sure the sequence length is 50 (consistent with training)
+     target_length = 50
     if len(keypoints_sequence) > target_length:
-         keypoints_sequence = keypoints_sequence[:target_length]
+         # Uniform sampling
+         indices = np.linspace(0, len(keypoints_sequence)-1, target_length, dtype=int)
+         keypoints_sequence = [keypoints_sequence[i] for i in indices]
     elif len(keypoints_sequence) < target_length:
-         last_frame = keypoints_sequence[-1] if keypoints_sequence else np.zeros(150)
+         # Repeat the last frame
+         last_frame = keypoints_sequence[-1] if keypoints_sequence else np.zeros(225)
         while len(keypoints_sequence) < target_length:
             keypoints_sequence.append(last_frame)
 
-     # Combine features
-     features_sequence = []
-     for i, keypoints in enumerate(keypoints_sequence):
-         if i < len(optical_flow) // 54:
-             flow_feature = optical_flow[i*54:(i+1)*54]
-         else:
-             flow_feature = np.zeros(54)
-
-         combined_features = np.concatenate([keypoints, flow_feature, np.zeros(54)])
-         features_sequence.append(combined_features)
+     # Create optical-flow features for every time step
+     flow_sequence = []
+     for i in range(target_length):
+         flow_sequence.append(optical_flow)
 
     # Convert to tensors and predict
-     features_tensor = torch.tensor([features_sequence], dtype=torch.float32).to(device)
+     keypoints_tensor = torch.tensor([keypoints_sequence], dtype=torch.float32).to(device)
+     flow_tensor = torch.tensor([flow_sequence], dtype=torch.float32).to(device)
 
     with torch.no_grad():
-         outputs = model(features_tensor)
+         outputs = model(keypoints_tensor, flow_tensor)
         probabilities = torch.softmax(outputs, dim=1)
         predicted_class = torch.argmax(probabilities, dim=1).item()
         confidence = probabilities[0][predicted_class].item()
@@ -240,7 +435,7 @@ demo = gr.Interface(
     **System highlights:**
     - 🎯 Accuracy: 94.25%
     - 📚 Supports 34 sign-language words
-     - 🧠 BiLSTM + attention mechanism
+     - 🧠 BiLSTM + GRU + multi-head attention
     - 👁️ MediaPipe + optical-flow feature fusion
 
     **How to use:**
requirements.txt CHANGED
@@ -5,4 +5,5 @@ opencv-python>=4.8.0
  mediapipe>=0.10.0
  numpy>=1.24.0
  Pillow>=9.5.0
- scipy>=1.10.0
+ scipy>=1.10.0
+ pydantic==2.10.6