fabiosam committed on
Commit
f9d7cbe
·
verified ·
1 Parent(s): 56da706

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -107
app.py CHANGED
@@ -1,77 +1,51 @@
1
- import os
2
- import json
3
- import cv2
4
  import numpy as np
5
  import mediapipe as mp
6
- import tensorflow as tf
7
- import gradio as gr
8
-
9
- print("TensorFlow version:", tf.__version__)
10
-
11
- # ==== RUTAS DEL MODELO ====
12
- BASE_DIR = os.path.dirname(__file__)
13
- MODELS_DIR = os.path.join(BASE_DIR, "models")
14
 
15
- MODEL_PATH = os.path.join(MODELS_DIR, "sign_model_lstm_v1.keras")
16
- LABELS_PATH = os.path.join(MODELS_DIR, "label_names.json")
 
17
 
 
18
  print("Cargando modelo desde:", MODEL_PATH)
19
- model = tf.keras.models.load_model(MODEL_PATH)
 
20
 
21
  with open(LABELS_PATH, "r") as f:
22
  label_names = json.load(f)
23
 
24
- mp_holistic = mp.solutions.holistic
25
-
26
- MAX_FRAMES = 20 # mismo valor que usaste al entrenar
27
- N_FEATURES = 225 # 75 puntos * 3 coords (x, y, z)
28
-
29
 
30
- # ========= FUNCIONES DE PROCESADO =========
31
 
32
  def extract_landmarks_from_results(results):
33
- """
34
- Convierte los resultados de MediaPipe Holistic en un vector 1D (225,)
35
- con pose (33), mano izq (21) y mano der (21).
36
- """
37
- def get_xyz(landmarks, n_points):
38
  if landmarks is None:
39
- data = [[0.0, 0.0, 0.0]] * n_points
40
  else:
41
  data = [[lm.x, lm.y, lm.z] for lm in landmarks]
42
- if len(data) < n_points:
43
- data += [[0.0, 0.0, 0.0]] * (n_points - len(data))
44
- data = data[:n_points]
45
  return data
46
 
47
- pose = get_xyz(
48
- results.pose_landmarks.landmark if results.pose_landmarks else None,
49
- 33
50
- )
51
- left_hand = get_xyz(
52
- results.left_hand_landmarks.landmark if results.left_hand_landmarks else None,
53
- 21
54
- )
55
- right_hand = get_xyz(
56
- results.right_hand_landmarks.landmark if results.right_hand_landmarks else None,
57
- 21
58
- )
59
 
60
- all_points = pose + left_hand + right_hand
61
- return np.array(all_points, dtype=np.float32).flatten() # (225,)
62
 
63
-
64
- def preprocess_video_to_sequence(video_path, max_frames=MAX_FRAMES):
65
- """
66
- Lee un video, extrae landmarks por frame y devuelve
67
- una secuencia (1, max_frames, 225) lista para el LSTM.
68
- """
69
- if video_path is None:
70
- raise ValueError("No se recibió ruta de video.")
71
 
72
  cap = cv2.VideoCapture(video_path)
73
- frames_feats = []
74
-
75
  with mp_holistic.Holistic(
76
  static_image_mode=False,
77
  model_complexity=1,
@@ -86,80 +60,54 @@ def preprocess_video_to_sequence(video_path, max_frames=MAX_FRAMES):
86
  if not ret:
87
  break
88
 
89
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
90
- results = holistic.process(frame_rgb)
91
-
92
- vec = extract_landmarks_from_results(results) # (225,)
93
- frames_feats.append(vec)
94
 
95
- if len(frames_feats) >= max_frames:
96
- break
97
 
98
  cap.release()
99
 
100
- if len(frames_feats) == 0:
101
- raise ValueError("El video no tiene frames válidos para procesar.")
102
-
103
- seq = np.array(frames_feats, dtype=np.float32)
104
 
105
- # padding / recorte
106
- if seq.shape[0] < max_frames:
107
- pad_len = max_frames - seq.shape[0]
108
- pad = np.zeros((pad_len, seq.shape[1]), dtype=np.float32)
109
- seq = np.concatenate([seq, pad], axis=0)
110
  else:
111
- seq = seq[:max_frames, :]
112
 
113
- seq = seq.reshape(1, max_frames, seq.shape[1]) # (1, T, 225)
114
- return seq
115
-
116
-
117
- def predict_video_lstm(video_path):
118
- """
119
- Función que usa Gradio:
120
- - Recibe la ruta de un video
121
- - Devuelve la predicción principal + distribución de probabilidades
122
- """
123
- if video_path is None:
124
- return "Sube o graba un video primero.", {}
125
-
126
- try:
127
- seq = preprocess_video_to_sequence(video_path, max_frames=MAX_FRAMES)
128
- probs = model.predict(seq, verbose=0)[0] # (num_classes,)
129
- except Exception as e:
130
- return f"Error procesando el video: {e}", {}
131
 
 
132
  idx = int(np.argmax(probs))
133
- label = label_names[idx]
134
- conf = float(probs[idx])
135
-
136
- prob_dict = {name: float(probs[i]) for i, name in enumerate(label_names)}
137
 
138
- texto = f"Predicción: {label} (confianza {conf:.2f})"
139
- return texto, prob_dict
140
 
141
-
142
- # ========= INTERFAZ GRADIO =========
143
 
144
  demo = gr.Interface(
145
  fn=predict_video_lstm,
146
  inputs=gr.Video(
147
- sources=["upload", "webcam"], # 👈 AQUÍ está la diferencia: sin 'source'
148
- label="Video de seña (sube o graba)",
149
- format="mp4",
150
- type="filepath" # Gradio le pasa a la función la ruta del archivo
151
  ),
152
  outputs=[
153
  gr.Textbox(label="Resultado"),
154
- gr.Label(label="Probabilidades por clase")
155
  ],
156
  title="LSP-EnSeñas - Demo LSTM",
157
  description=(
158
- "Sube un video corto o grábalo con la webcam haciendo una seña. "
159
- "El modelo LSTM procesa la secuencia de landmarks (pose + manos) y "
160
- "muestra la clase más probable y la distribución de probabilidades."
161
- ),
162
  )
163
 
164
- if __name__ == "__main__":
165
- demo.launch()
 
1
+ import gradio as gr
 
 
2
  import numpy as np
3
  import mediapipe as mp
4
+ import cv2
5
+ from tensorflow import keras
6
+ import json
7
+ import os
 
 
 
 
8
 
9
+ # === LOAD MODEL & LABELS ===
10
+ MODEL_PATH = "models/sign_model_lstm_v1.keras"
11
+ LABELS_PATH = "models/label_names.json"
12
 
13
+ print(f"TensorFlow version: {keras.__version__}")
14
  print("Cargando modelo desde:", MODEL_PATH)
15
+
16
+ model = keras.models.load_model(MODEL_PATH)
17
 
18
  with open(LABELS_PATH, "r") as f:
19
  label_names = json.load(f)
20
 
21
+ NUM_POINTS = (33 + 21 + 21) * 3 # pose + left + right = (33 + 21 + 21) landmarks, xyz values
 
 
 
 
22
 
23
+ mp_holistic = mp.solutions.holistic
24
 
25
  def extract_landmarks_from_results(results):
26
+ def get_xyz(landmarks, n):
 
 
 
 
27
  if landmarks is None:
28
+ data = [[0,0,0]] * n
29
  else:
30
  data = [[lm.x, lm.y, lm.z] for lm in landmarks]
31
+ if len(data) < n:
32
+ data += [[0,0,0]] * (n - len(data))
33
+ data = data[:n]
34
  return data
35
 
36
+ pose = get_xyz(results.pose_landmarks.landmark if results.pose_landmarks else None, 33)
37
+ left = get_xyz(results.left_hand_landmarks.landmark if results.left_hand_landmarks else None, 21)
38
+ right = get_xyz(results.right_hand_landmarks.landmark if results.right_hand_landmarks else None, 21)
 
 
 
 
 
 
 
 
 
39
 
40
+ all_points = pose + left + right
41
+ return np.array(all_points, dtype=np.float32).flatten()
42
 
43
+ def predict_video_lstm(video_path):
44
+ print("Procesando video:", video_path)
 
 
 
 
 
 
45
 
46
  cap = cv2.VideoCapture(video_path)
47
+ frames_landmarks = []
48
+
49
  with mp_holistic.Holistic(
50
  static_image_mode=False,
51
  model_complexity=1,
 
60
  if not ret:
61
  break
62
 
63
+ image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
64
+ results = holistic.process(image_rgb)
 
 
 
65
 
66
+ vec = extract_landmarks_from_results(results)
67
+ frames_landmarks.append(vec)
68
 
69
  cap.release()
70
 
71
+ if len(frames_landmarks) == 0:
72
+ return "No se encontraron landmarks", {}
 
 
73
 
74
+ # Pad / slice to fixed length (20 frames)
75
+ MAX_FRAMES = 20
76
+ if len(frames_landmarks) < MAX_FRAMES:
77
+ pad = [np.zeros(NUM_POINTS)] * (MAX_FRAMES - len(frames_landmarks))
78
+ frames_landmarks = frames_landmarks + pad
79
  else:
80
+ frames_landmarks = frames_landmarks[:MAX_FRAMES]
81
 
82
+ X = np.array(frames_landmarks, dtype=np.float32).reshape(1, MAX_FRAMES, NUM_POINTS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
+ probs = model.predict(X, verbose=0)[0]
85
  idx = int(np.argmax(probs))
86
+
87
+ prediction = label_names[idx]
88
+ confidences = {label_names[i]: float(probs[i]) for i in range(len(probs))}
 
89
 
90
+ return f"Predicción: {prediction}", confidences
 
91
 
92
+ # === GRADIO UI ===
 
93
 
94
  demo = gr.Interface(
95
  fn=predict_video_lstm,
96
  inputs=gr.Video(
97
+ sources=["upload", "webcam"],
98
+ label="Sube un video o graba tu seña",
99
+ format="mp4"
 
100
  ),
101
  outputs=[
102
  gr.Textbox(label="Resultado"),
103
+ gr.Label(label="Confianza por clase")
104
  ],
105
  title="LSP-EnSeñas - Demo LSTM",
106
  description=(
107
+ "Traductor de señas basado en LSTM + MediaPipe Holistic.\n"
108
+ "Sube un video corto o grábalo en vivo haciendo una seña.\n"
109
+ "El modelo procesará el movimiento (cuerpo + manos) y mostrará la predicción."
110
+ )
111
  )
112
 
113
+ demo.launch()