fabiosam committed on
Commit
676c99a
·
verified ·
1 Parent(s): f9d7cbe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -58
app.py CHANGED
@@ -1,51 +1,80 @@
1
- import gradio as gr
 
 
 
 
2
  import numpy as np
3
  import mediapipe as mp
4
- import cv2
5
  from tensorflow import keras
6
- import json
7
- import os
8
 
9
- # === LOAD MODEL & LABELS ===
10
- MODEL_PATH = "models/sign_model_lstm_v1.keras"
11
- LABELS_PATH = "models/label_names.json"
12
 
13
- print(f"TensorFlow version: {keras.__version__}")
14
- print("Cargando modelo desde:", MODEL_PATH)
 
 
 
15
 
 
 
 
 
16
  model = keras.models.load_model(MODEL_PATH)
17
 
18
  with open(LABELS_PATH, "r") as f:
19
  label_names = json.load(f)
20
 
21
- NUM_POINTS = (33 + 21 + 21) * 3 # pose + left + right = (33 + 21 + 21) landmarks, xyz values
 
22
 
 
23
  mp_holistic = mp.solutions.holistic
 
 
 
24
 
 
25
  def extract_landmarks_from_results(results):
26
- def get_xyz(landmarks, n):
 
 
 
 
 
27
  if landmarks is None:
28
- data = [[0,0,0]] * n
29
  else:
30
  data = [[lm.x, lm.y, lm.z] for lm in landmarks]
31
- if len(data) < n:
32
- data += [[0,0,0]] * (n - len(data))
33
- data = data[:n]
34
  return data
35
 
36
  pose = get_xyz(results.pose_landmarks.landmark if results.pose_landmarks else None, 33)
37
- left = get_xyz(results.left_hand_landmarks.landmark if results.left_hand_landmarks else None, 21)
38
- right = get_xyz(results.right_hand_landmarks.landmark if results.right_hand_landmarks else None, 21)
39
 
40
- all_points = pose + left + right
41
- return np.array(all_points, dtype=np.float32).flatten()
42
 
43
- def predict_video_lstm(video_path):
44
- print("Procesando video:", video_path)
45
 
 
 
 
 
 
 
 
 
46
  cap = cv2.VideoCapture(video_path)
47
- frames_landmarks = []
48
-
 
 
 
49
  with mp_holistic.Holistic(
50
  static_image_mode=False,
51
  model_complexity=1,
@@ -60,54 +89,130 @@ def predict_video_lstm(video_path):
60
  if not ret:
61
  break
62
 
63
- image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
64
- results = holistic.process(image_rgb)
65
 
66
  vec = extract_landmarks_from_results(results)
67
- frames_landmarks.append(vec)
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  cap.release()
70
 
71
- if len(frames_landmarks) == 0:
72
- return "No se encontraron landmarks", {}
73
 
74
- # Pad / slice to fixed length (20 frames)
75
- MAX_FRAMES = 20
76
- if len(frames_landmarks) < MAX_FRAMES:
77
- pad = [np.zeros(NUM_POINTS)] * (MAX_FRAMES - len(frames_landmarks))
78
- frames_landmarks = frames_landmarks + pad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  else:
80
- frames_landmarks = frames_landmarks[:MAX_FRAMES]
81
 
82
- X = np.array(frames_landmarks, dtype=np.float32).reshape(1, MAX_FRAMES, NUM_POINTS)
83
 
84
- probs = model.predict(X, verbose=0)[0]
85
  idx = int(np.argmax(probs))
86
-
87
- prediction = label_names[idx]
88
- confidences = {label_names[i]: float(probs[i]) for i in range(len(probs))}
89
-
90
- return f"Predicción: {prediction}", confidences
91
-
92
- # === GRADIO UI ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  demo = gr.Interface(
95
- fn=predict_video_lstm,
96
- inputs=gr.Video(
97
- sources=["upload", "webcam"],
98
- label="Sube un video o graba tu seña",
99
- format="mp4"
100
- ),
101
  outputs=[
102
- gr.Textbox(label="Resultado"),
103
- gr.Label(label="Confianza por clase")
 
104
  ],
105
- title="LSP-EnSeñas - Demo LSTM",
106
- description=(
107
- "Traductor de señas basado en LSTM + MediaPipe Holistic.\n"
108
- "Sube un video corto o grábalo en vivo haciendo una seña.\n"
109
- "El modelo procesará el movimiento (cuerpo + manos) y mostrará la predicción."
110
- )
111
  )
112
 
113
- demo.launch()
 
 
1
+ import os
2
+ import json
3
+ import io
4
+
5
+ import cv2
6
  import numpy as np
7
  import mediapipe as mp
8
+ import tensorflow as tf
9
  from tensorflow import keras
 
 
10
 
11
+ import gradio as gr
12
+ import matplotlib.pyplot as plt
13
+ from PIL import Image
14
 
15
print("TensorFlow version:", tf.__version__)

# ==== MODEL PATHS ====
# Paths are resolved relative to this file so that loading does not depend on
# the current working directory of the process.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODELS_DIR = os.path.join(BASE_DIR, "models")

MODEL_PATH = os.path.join(MODELS_DIR, "sign_model_lstm_v1.keras")
LABELS_PATH = os.path.join(MODELS_DIR, "label_names.json")

print("Cargando modelo desde:", MODEL_PATH)
model = keras.models.load_model(MODEL_PATH)

# Decode explicitly as UTF-8: without an encoding argument, open() falls back
# to the host locale, which can garble non-ASCII label names.
with open(LABELS_PATH, "r", encoding="utf-8") as f:
    label_names = json.load(f)

num_classes = len(label_names)
# Sequence length fed to the model. Per the original author's note this is the
# same value that was used at training time -- TODO confirm against training.
MAX_FRAMES = 20

# ==== MEDIAPIPE ====
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_styles = mp.solutions.drawing_styles
37
+
38
 
39
# ---- 1. EXTRACT LANDMARKS AS A FLAT VECTOR ----
# (per the original author's note, intended to match the training-time layout)
def extract_landmarks_from_results(results):
    """
    Flatten MediaPipe Holistic results into a 1-D float32 feature vector.

    The vector is laid out as pose (33 points), left hand (21 points) and
    right hand (21 points), each point contributing (x, y, z):
    75 * 3 = 225 values.

    Args:
        results: Object exposing ``pose_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``. Each is either falsy (nothing
            detected) or has a ``landmark`` iterable whose items carry
            ``x``, ``y`` and ``z`` attributes.

    Returns:
        ``numpy.ndarray`` of shape ``(225,)`` and dtype ``float32``. Groups
        that are missing, or shorter than expected, are zero-filled; extra
        landmarks beyond the expected count are dropped.
    """
    # (attribute on ``results``, number of landmarks reserved for it), in the
    # order the groups appear in the output vector.
    layout = (
        ("pose_landmarks", 33),
        ("left_hand_landmarks", 21),
        ("right_hand_landmarks", 21),
    )
    total_points = sum(count for _, count in layout)

    # Start from zeros so undetected or short groups are padded implicitly,
    # without building aliased ``[[0, 0, 0]] * n`` lists.
    features = np.zeros((total_points, 3), dtype=np.float32)

    offset = 0
    for attr, count in layout:
        group = getattr(results, attr)
        if group:
            coords = [(lm.x, lm.y, lm.z) for lm in group.landmark][:count]
            if coords:
                features[offset:offset + len(coords)] = coords
        offset += count

    return features.reshape(-1)  # (225,)
62
 
 
 
63
 
64
+ # ---- 2. PROCESAR VIDEO -> SECUENCIA + FRAME CON LANDMARKS ----
65
+ def video_to_sequence_and_landmarks_frame(video_path, max_frames=MAX_FRAMES):
66
+ """
67
+ Procesa un video:
68
+ - Devuelve la secuencia (1, max_frames, 225) para el LSTM
69
+ - Devuelve una imagen (PIL.Image) con los landmarks dibujados
70
+ en el primer frame donde se detecte algo.
71
+ """
72
  cap = cv2.VideoCapture(video_path)
73
+
74
+ frames_feats = []
75
+ frame_for_vis = None
76
+ results_for_vis = None
77
+
78
  with mp_holistic.Holistic(
79
  static_image_mode=False,
80
  model_complexity=1,
 
89
  if not ret:
90
  break
91
 
92
+ frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
93
+ results = holistic.process(frame_rgb)
94
 
95
  vec = extract_landmarks_from_results(results)
96
+ frames_feats.append(vec)
97
+
98
+ # Guardamos el primer frame donde se detecta algo para visualizar
99
+ if frame_for_vis is None and (
100
+ results.pose_landmarks or
101
+ results.left_hand_landmarks or
102
+ results.right_hand_landmarks
103
+ ):
104
+ frame_for_vis = frame.copy()
105
+ results_for_vis = results
106
+
107
+ if len(frames_feats) >= max_frames:
108
+ break
109
 
110
  cap.release()
111
 
112
+ if len(frames_feats) == 0:
113
+ raise ValueError("No se pudieron leer frames válidos del video.")
114
 
115
+ seq = np.array(frames_feats, dtype=np.float32)
116
+
117
+ # Padding / recorte
118
+ if seq.shape[0] < max_frames:
119
+ pad_len = max_frames - seq.shape[0]
120
+ pad = np.zeros((pad_len, seq.shape[1]), dtype=np.float32)
121
+ seq = np.concatenate([seq, pad], axis=0)
122
+ else:
123
+ seq = seq[:max_frames, :]
124
+
125
+ seq = seq.reshape(1, max_frames, seq.shape[1]) # (1, T, 225)
126
+
127
+ # --- crear imagen con landmarks ---
128
+ landmarks_image = None
129
+ if frame_for_vis is not None and results_for_vis is not None:
130
+ annotated = frame_for_vis.copy()
131
+
132
+ mp_drawing.draw_landmarks(
133
+ annotated,
134
+ results_for_vis.pose_landmarks,
135
+ mp_holistic.POSE_CONNECTIONS,
136
+ landmark_drawing_spec=mp_styles.get_default_pose_landmarks_style()
137
+ )
138
+ mp_drawing.draw_landmarks(
139
+ annotated,
140
+ results_for_vis.left_hand_landmarks,
141
+ mp_holistic.HAND_CONNECTIONS,
142
+ landmark_drawing_spec=mp_styles.get_default_hand_landmarks_style()
143
+ )
144
+ mp_drawing.draw_landmarks(
145
+ annotated,
146
+ results_for_vis.right_hand_landmarks,
147
+ mp_holistic.HAND_CONNECTIONS,
148
+ landmark_drawing_spec=mp_styles.get_default_hand_landmarks_style()
149
+ )
150
+
151
+ annotated_rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)
152
+ landmarks_image = Image.fromarray(annotated_rgb)
153
+
154
+ return seq, landmarks_image
155
+
156
+
157
+ # ---- 3. PREDICCIÓN + GRÁFICO ----
158
def predict_from_video(video):
    """
    Gradio callback: classify the sign performed in a video.

    Args:
        video: Value provided by the video input. ``None`` when nothing was
            submitted; otherwise either the video path itself or a dict
            holding the path under the ``"video"`` key.

    Returns:
        A ``(text, figure, image)`` tuple: the prediction text, a Matplotlib
        bar chart of the per-class probabilities, and the annotated frame
        returned by ``video_to_sequence_and_landmarks_frame`` (may be
        ``None``). When there is no input, or the video cannot be processed,
        ``text`` is a user-facing message and the other two entries are
        ``None``.
    """
    if video is None:
        return "Sube un video o grábalo primero.", None, None

    # The input may arrive as a dict carrying the path under 'video'.
    if isinstance(video, dict) and "video" in video:
        video_path = video["video"]
    else:
        video_path = video

    try:
        seq, landmarks_image = video_to_sequence_and_landmarks_frame(video_path, MAX_FRAMES)
    except ValueError as exc:
        # Unreadable/empty videos are reported in the text output instead of
        # surfacing as a generic error in the UI.
        return str(exc), None, None

    probs = model.predict(seq, verbose=0)[0]  # (num_classes,)
    idx = int(np.argmax(probs))
    label = label_names[idx]
    conf = float(probs[idx])

    # Output text
    text_pred = f"Predicción: {label} (confianza {conf:.2f})"

    # Build the chart on a standalone Figure rather than through pyplot:
    # pyplot keeps a global reference to every figure it creates, which leaks
    # memory in a long-running server, and its "current figure" state is
    # shared between requests.
    from matplotlib.figure import Figure

    positions = range(len(label_names))
    fig = Figure(figsize=(6, 3))
    ax = fig.subplots()
    ax.bar(positions, probs)
    ax.set_xticks(positions)
    ax.set_xticklabels(label_names, rotation=45, ha="right")
    ax.set_ylim(0, 1)
    ax.set_ylabel("Confianza")
    ax.set_title("Confianza por clase")
    fig.tight_layout()

    return text_pred, fig, landmarks_image
194
+
195
+
196
+ # ---- 4. INTERFAZ GRADIO ----
197
# ---- 4. GRADIO INTERFACE ----
# Texts shown at the top of the page.
title = "LSP-EnSeñas - Demo LSTM"
description = (
    "Traductor de señas basado en LSTM + MediaPipe Holistic. "
    "Sube un video corto o grábalo en vivo haciendo una seña. "
    "El modelo procesará el movimiento (cuerpo + manos) y mostrará la predicción. "
    "También verás un frame con los puntos (landmarks) detectados por MediaPipe."
)

# Components are created in the same order as before (input first, then the
# three outputs, matching the tuple returned by predict_from_video).
_video_input = gr.Video(label="Sube un video o grábalo desde la cámara")
_result_outputs = [
    gr.Textbox(label="Predicción del modelo"),
    gr.Plot(label="Confianza por clase"),
    gr.Image(type="pil", label="Landmarks detectados (ejemplo de frame)"),
]

demo = gr.Interface(
    fn=predict_from_video,
    inputs=_video_input,
    outputs=_result_outputs,
    title=title,
    description=description,
)

# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()