fabiosam committed on
Commit
241790d
verified
1 Parent(s): 8ea163f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -158
app.py CHANGED
@@ -1,87 +1,101 @@
1
  import os
2
  import json
3
- import cv2
4
  import numpy as np
 
 
5
  import mediapipe as mp
6
  import tensorflow as tf
7
  from tensorflow import keras
8
 
9
- import gradio as gr
10
- import matplotlib.pyplot as plt
 
 
 
 
11
 
12
- print("TensorFlow version:", tf.__version__)
13
 
14
- # ==== RUTAS DEL MODELO ====
15
- BASE_DIR = os.path.dirname(os.path.abspath(__file__))
16
- MODELS_DIR = os.path.join(BASE_DIR, "models")
 
 
 
 
 
 
17
 
18
- MODEL_PATH = os.path.join(MODELS_DIR, "sign_model_lstm_v1.keras")
19
- LABELS_PATH = os.path.join(MODELS_DIR, "label_names.json")
 
 
 
 
20
 
21
- print("Cargando modelo desde:", MODEL_PATH)
22
- model = keras.models.load_model(MODEL_PATH)
 
 
 
 
23
 
24
- with open(LABELS_PATH, "r") as f:
25
- label_names = json.load(f)
 
 
 
 
26
 
27
- num_classes = len(label_names)
28
- MAX_FRAMES = 20 # mismo valor que usaste al entrenar
29
 
30
 
31
- # ==== MEDIAPIPE ====
32
- mp_holistic = mp.solutions.holistic
33
- mp_drawing = mp.solutions.drawing_utils
34
- mp_styles = mp.solutions.drawing_styles
 
 
 
 
 
 
 
 
 
 
35
 
36
 
37
- def extract_landmarks_from_results(results):
38
- """
39
- Convierte los resultados de MediaPipe Holistic en un vector 1D (225,)
40
- con pose (33), mano izq (21) y mano der (21).
41
- Cada punto = (x, y, z) => 75 * 3 = 225 features.
42
- """
43
- def get_xyz(landmarks, n_points):
44
- if landmarks is None:
45
- data = [[0.0, 0.0, 0.0]] * n_points
46
- else:
47
- data = [[lm.x, lm.y, lm.z] for lm in landmarks]
48
- if len(data) < n_points:
49
- data += [[0.0, 0.0, 0.0]] * (n_points - len(data))
50
- data = data[:n_points]
51
- return data
52
 
53
- pose = get_xyz(results.pose_landmarks.landmark if results.pose_landmarks else None, 33)
54
- left_hand = get_xyz(results.left_hand_landmarks.landmark if results.left_hand_landmarks else None, 21)
55
- right_hand = get_xyz(results.right_hand_landmarks.landmark if results.right_hand_landmarks else None, 21)
56
 
57
- all_points = pose + left_hand + right_hand
58
- return np.array(all_points, dtype=np.float32).flatten() # (225,)
59
 
 
 
 
 
60
 
61
- def video_to_sequence_and_annotated(video_path, max_frames=MAX_FRAMES):
62
- """
63
- Procesa un video:
64
- - Devuelve la secuencia (1, max_frames, 225) para el LSTM
65
- - Devuelve la ruta de un nuevo video con los landmarks dibujados.
66
- """
67
- cap = cv2.VideoCapture(video_path)
68
 
69
- if not cap.isOpened():
70
- raise ValueError(f"No se pudo abrir el video: {video_path}")
71
 
72
- frames_feats = []
73
 
74
- # Info del video de entrada
75
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
76
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
77
- fps = cap.get(cv2.CAP_PROP_FPS)
78
- if fps is None or fps <= 0:
79
- fps = 25.0
80
 
81
- # Ruta temporal para el video anotado
82
- out_path = os.path.join("/tmp", "annotated_output.mp4")
83
- fourcc = cv2.VideoWriter_fourcc(*"mp4v")
84
- writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
85
 
86
  with mp_holistic.Holistic(
87
  static_image_mode=False,
@@ -90,128 +104,61 @@ def video_to_sequence_and_annotated(video_path, max_frames=MAX_FRAMES):
90
  refine_face_landmarks=False,
91
  min_detection_confidence=0.5,
92
  min_tracking_confidence=0.5
93
- ) as holistic:
94
 
95
- frame_idx = 0
96
  while True:
97
  ret, frame = cap.read()
98
  if not ret:
99
  break
100
 
101
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
102
- results = holistic.process(frame_rgb)
103
-
104
- # Features para el modelo (solo guardamos hasta max_frames)
105
- if len(frames_feats) < max_frames:
106
- vec = extract_landmarks_from_results(results)
107
- frames_feats.append(vec)
108
-
109
- # DIBUJAR LANDMARKS SOBRE EL FRAME
110
- annotated = frame.copy()
111
-
112
- if results.pose_landmarks:
113
- mp_drawing.draw_landmarks(
114
- annotated,
115
- results.pose_landmarks,
116
- mp_holistic.POSE_CONNECTIONS,
117
- landmark_drawing_spec=mp_styles.get_default_pose_landmarks_style()
118
- )
119
- if results.left_hand_landmarks:
120
- mp_drawing.draw_landmarks(
121
- annotated,
122
- results.left_hand_landmarks,
123
- mp_holistic.HAND_CONNECTIONS,
124
- landmark_drawing_spec=mp_styles.get_default_hand_landmarks_style()
125
- )
126
- if results.right_hand_landmarks:
127
- mp_drawing.draw_landmarks(
128
- annotated,
129
- results.right_hand_landmarks,
130
- mp_holistic.HAND_CONNECTIONS,
131
- landmark_drawing_spec=mp_styles.get_default_hand_landmarks_style()
132
- )
133
-
134
- writer.write(annotated)
135
- frame_idx += 1
136
 
137
  cap.release()
138
- writer.release()
139
 
140
- if len(frames_feats) == 0:
141
- raise ValueError("No se pudieron leer frames v谩lidos del video.")
142
-
143
- seq = np.array(frames_feats, dtype=np.float32)
144
-
145
- # Padding / recorte para el LSTM
146
- if seq.shape[0] < max_frames:
147
- pad_len = max_frames - seq.shape[0]
148
- pad = np.zeros((pad_len, seq.shape[1]), dtype=np.float32)
149
- seq = np.concatenate([seq, pad], axis=0)
150
  else:
151
- seq = seq[:max_frames, :]
 
152
 
153
- seq = seq.reshape(1, max_frames, seq.shape[1]) # (1, T, 225)
154
 
155
- return seq, out_path
 
156
 
 
157
 
158
- def predict_from_video(video):
159
- """
160
- Función llamada por Gradio.
161
- - Recibe ruta del video (upload o webcam).
162
- - Devuelve: texto, gráfico de barras, video con landmarks.
163
- """
164
- if video is None:
165
- return "Sube un video o gr谩balo primero.", None, None
166
-
167
- # Gradio a veces pasa dict con 'video'
168
- if isinstance(video, dict) and "video" in video:
169
- video_path = video["video"]
170
- else:
171
- video_path = video
172
-
173
- seq, annotated_path = video_to_sequence_and_annotated(video_path, MAX_FRAMES)
174
 
175
- probs = model.predict(seq, verbose=0)[0] # (num_classes,)
 
 
 
 
 
176
  idx = int(np.argmax(probs))
177
  label = label_names[idx]
178
  conf = float(probs[idx])
179
 
180
- # Texto
181
- text_pred = f"Predicci贸n: {label} (confianza {conf:.2f})"
182
-
183
- # Gráfico de barras
184
- fig, ax = plt.subplots(figsize=(6, 3))
185
- ax.bar(range(len(label_names)), probs)
186
- ax.set_xticks(range(len(label_names)))
187
- ax.set_xticklabels(label_names, rotation=45, ha="right")
188
- ax.set_ylim(0, 1)
189
- ax.set_ylabel("Confianza")
190
- ax.set_title("Confianza por clase")
191
- plt.tight_layout()
192
-
193
- # annotated_path es la ruta del video con landmarks
194
- return text_pred, fig, annotated_path
195
-
196
-
197
- title = "LSP-EnSe帽as - Demo LSTM"
198
- description = (
199
- "Traductor de se帽as basado en LSTM + MediaPipe Holistic. "
200
- "Sube un video corto o gr谩balo en vivo haciendo una se帽a. "
201
- "El modelo procesar谩 el movimiento (cuerpo + manos), mostrar谩 la predicci贸n "
202
- "y devolver谩 tu video con los puntos (landmarks) dibujados."
203
- )
204
 
 
 
 
205
  demo = gr.Interface(
206
- fn=predict_from_video,
207
- inputs=gr.Video(label="Sube un video o gr谩balo desde la c谩mara"),
208
  outputs=[
209
- gr.Textbox(label="Predicci贸n del modelo"),
210
- gr.Plot(label="Confianza por clase"),
211
- gr.Video(label="Video con landmarks detectados"),
212
  ],
213
- title=title,
214
- description=description,
215
  )
216
 
217
  if __name__ == "__main__":
 
1
  import os
2
  import json
 
3
  import numpy as np
4
+ import cv2
5
+ import gradio as gr
6
  import mediapipe as mp
7
  import tensorflow as tf
8
  from tensorflow import keras
9
 
10
+ # ---------------------------------------------------------
11
+ # CONFIG
12
+ # ---------------------------------------------------------
13
+ MODELS_DIR = "models"
14
+ MAX_FRAMES = 20
15
+ N_FEATURES = 225 # 75 landmarks * (x,y,z)
16
 
17
+ mp_holistic = mp.solutions.holistic
18
 
19
+ # ---------------------------------------------------------
20
+ # LANDMARK EXTRACTION (MISMO QUE EN TRAIN)
21
+ # ---------------------------------------------------------
22
# (attribute on the Holistic results object, expected landmark count), listed
# in the order the groups are laid out in the feature vector:
# left hand (21), right hand (21), pose (33) -> 75 points * 3 coords = 225.
_LANDMARK_GROUPS = (
    ("left_hand_landmarks", 21),
    ("right_hand_landmarks", 21),
    ("pose_landmarks", 33),
)


def extract_landmarks_from_results(results):
    """Flatten one frame of MediaPipe Holistic results into a feature vector.

    The vector is the concatenation of (x, y, z) for every landmark of the
    left hand, then the right hand, then the pose. A group that was not
    detected (falsy attribute) contributes zeros.

    Each group is padded with zeros or truncated to its expected size, so the
    returned vector always has 225 entries even if a group reports an
    unexpected number of landmarks. A ragged vector would otherwise break the
    later stacking of per-frame features.

    Args:
        results: object exposing ``left_hand_landmarks``,
            ``right_hand_landmarks`` and ``pose_landmarks``; each is either
            falsy or has a ``landmark`` iterable of points with ``x``, ``y``
            and ``z`` attributes.

    Returns:
        1-D ``np.float32`` array of length 225.
    """
    features = []
    for attr_name, n_points in _LANDMARK_GROUPS:
        group = getattr(results, attr_name)
        coords = []
        if group:
            for lm in group.landmark:
                coords.extend((lm.x, lm.y, lm.z))

        # Force the group to its fixed width: drop any surplus values and
        # zero-fill whatever is missing (a negative repeat count yields []).
        width = n_points * 3
        coords = coords[:width] + [0.0] * (width - len(coords))
        features.extend(coords)

    return np.array(features, dtype=np.float32)
 
51
 
52
 
53
+ # ---------------------------------------------------------
54
+ # PAD/TRUNCATE EXACTO AL DEL NOTEBOOK
55
+ # ---------------------------------------------------------
56
def pad_or_truncate(seq, max_frames=MAX_FRAMES):
    """Force a per-frame feature array to exactly ``max_frames`` rows.

    Args:
        seq: 2-D array indexed as (frame, feature).
        max_frames: number of rows the result must have.

    Returns:
        ``seq`` itself when it already has ``max_frames`` rows; a centered
        window of ``max_frames`` rows when it is longer; otherwise ``seq``
        followed by float32 zero rows up to ``max_frames``.
    """
    n_frames = seq.shape[0]
    surplus = n_frames - max_frames

    if surplus == 0:
        return seq

    if surplus > 0:
        # Too long: keep the middle window, discarding frames at both ends.
        first = surplus // 2
        return seq[first:first + max_frames]

    # Too short: append all-zero frames at the end.
    filler = np.zeros((-surplus, seq.shape[1]), dtype=np.float32)
    return np.concatenate([seq, filler], axis=0)
67
 
68
 
69
+ # ---------------------------------------------------------
70
+ # CARGAR MODELO + LABELS + NORMALIZACIÓN
71
+ # ---------------------------------------------------------
72
def load_model():
    """Load the Keras model and its companion artifacts from ``MODELS_DIR``.

    Reads four files from ``MODELS_DIR``: ``sign_model_lstm_v1.keras``,
    ``label_names.json``, ``feature_mean.npy`` and ``feature_std.npy``.

    Returns:
        Tuple ``(model, label_names, feature_mean, feature_std)``:
        the loaded Keras model, the decoded JSON label collection, and the
        two arrays loaded with ``np.load``.
    """
    def artifact(filename):
        # All artifacts live side by side in the models directory.
        return os.path.join(MODELS_DIR, filename)

    model = keras.models.load_model(artifact("sign_model_lstm_v1.keras"))

    # Decode explicitly as UTF-8: without it, open() uses the platform
    # locale and labels containing accented characters may be corrupted.
    with open(artifact("label_names.json"), "r", encoding="utf-8") as f:
        label_names = json.load(f)

    feature_mean = np.load(artifact("feature_mean.npy"))
    feature_std = np.load(artifact("feature_std.npy"))

    return model, label_names, feature_mean, feature_std
87
 
 
 
 
 
 
 
 
88
 
89
+ model, label_names, feature_mean, feature_std = load_model()
 
90
 
 
91
 
92
+ # ---------------------------------------------------------
93
+ # PROCESAR VIDEO (MISMO QUE EN TRAIN)
94
+ # ---------------------------------------------------------
95
+ def process_video(video_file):
 
 
96
 
97
+ cap = cv2.VideoCapture(video_file)
98
+ frames = []
 
 
99
 
100
  with mp_holistic.Holistic(
101
  static_image_mode=False,
 
104
  refine_face_landmarks=False,
105
  min_detection_confidence=0.5,
106
  min_tracking_confidence=0.5
107
+ ) as holis:
108
 
 
109
  while True:
110
  ret, frame = cap.read()
111
  if not ret:
112
  break
113
 
114
+ rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
115
+ results = holis.process(rgb)
116
+ feats = extract_landmarks_from_results(results)
117
+ frames.append(feats)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  cap.release()
 
120
 
121
+ if len(frames) == 0:
122
+ seq = np.zeros((MAX_FRAMES, N_FEATURES), dtype=np.float32)
 
 
 
 
 
 
 
 
123
  else:
124
+ seq_full = np.stack(frames, axis=0)
125
+ seq = pad_or_truncate(seq_full)
126
 
127
+ seq = seq[np.newaxis, ...] # (1, T, 225)
128
 
129
+ # NORMALIZACIÓN IGUAL
130
+ seq_norm = (seq - feature_mean) / feature_std
131
 
132
+ return seq_norm
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
+ # ---------------------------------------------------------
136
+ # PREDICCIÓN FINAL
137
+ # ---------------------------------------------------------
138
def predict(video):
    """Classify the sign shown in a video; this is the Gradio callback.

    Args:
        video: the value passed through to ``process_video``, or ``None``
            when nothing was submitted.

    Returns:
        Tuple ``(text, probs_dict)``. ``text`` is a summary naming the top
        label and its confidence. ``probs_dict`` maps every label name to
        its probability as a float. When ``video`` is ``None`` the tuple is
        ``("Sube un video primero.", None)`` and nothing is processed.
    """
    # Nothing was submitted: answer with a message rather than handing
    # None to the video reader.
    if video is None:
        return "Sube un video primero.", None

    seq = process_video(video)
    probs = model.predict(seq, verbose=0)[0]

    idx = int(np.argmax(probs))
    label = label_names[idx]
    conf = float(probs[idx])

    # Plain floats keyed by label name, for the gr.Label output.
    probs_dict = {name: float(probs[i]) for i, name in enumerate(label_names)}

    return f"Predicción: {label} (confianza {conf:.2f})", probs_dict
149
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
+ # ---------------------------------------------------------
152
+ # UI GRADIO
153
+ # ---------------------------------------------------------
154
# Gradio interface: one video input wired to `predict`, whose two return
# values fill a text box and a per-class probability label.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Video(label="Sube un video haciendo la seña"),
    outputs=[
        gr.Textbox(label="Resultado"),
        gr.Label(label="Probabilidades por clase"),
    ],
    title="Sign Language Translator - LSTM",
)
163
 
164
  if __name__ == "__main__":