fabiosam committed
Commit af4ab3b · verified · 1 Parent(s): f52c4ac

Create app.py

Files changed (1): app.py (+183, -0)
app.py ADDED
@@ -0,0 +1,183 @@
+ import os
+ import cv2
+ import json
+ import numpy as np
+ import mediapipe as mp
+ import tensorflow as tf
+ from tensorflow import keras
+ import gradio as gr
+
+ # =========================
+ # BASIC CONFIGURATION
+ # =========================
+
+ MAX_FRAMES = 20  # must be the same value used when training the model
+ MODEL_DIR = "models"
+
+ MODEL_PATH = os.path.join(MODEL_DIR, "sign_model_lstm_v1.keras")
+ LABELS_PATH = os.path.join(MODEL_DIR, "label_names.json")
+
+ print("TensorFlow version:", tf.__version__)
+ print("Loading model from:", MODEL_PATH)
+
+ # Load the trained LSTM model
+ model = keras.models.load_model(MODEL_PATH)
+
+ # Load the class names
+ with open(LABELS_PATH, "r") as f:
+     label_names = json.load(f)
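+
+ # NOTE (assumption): label_names.json is expected to contain a JSON list of
+ # class names ordered by training index, so model output i maps to
+ # label_names[i]. If it were saved as a {name: index} dict instead, it would
+ # need inverting first, e.g.:
+ #   label_names = [n for n, _ in sorted(label_names.items(), key=lambda kv: kv[1])]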
+
+ mp_holistic = mp.solutions.holistic
+
+
+ # =========================
+ # LANDMARK EXTRACTION
+ # =========================
+
+ def extract_landmarks_from_results(results):
+     """
+     Converts MediaPipe Holistic results into a 1D vector.
+     Pose (33), left hand (21), right hand (21) -> 75 points.
+     Each point = (x, y, z) => 75 * 3 = 225 features.
+     """
+     def get_xyz(landmarks, n_points):
+         # A missing part (e.g. a hand out of frame) becomes all zeros.
+         if landmarks is None:
+             data = [[0.0, 0.0, 0.0]] * n_points
+         else:
+             data = [[lm.x, lm.y, lm.z] for lm in landmarks]
+         # Pad or truncate so each part contributes exactly n_points.
+         if len(data) < n_points:
+             data += [[0.0, 0.0, 0.0]] * (n_points - len(data))
+         data = data[:n_points]
+         return data
+
+     pose = get_xyz(
+         results.pose_landmarks.landmark if results.pose_landmarks else None,
+         33
+     )
+     left_hand = get_xyz(
+         results.left_hand_landmarks.landmark if results.left_hand_landmarks else None,
+         21
+     )
+     right_hand = get_xyz(
+         results.right_hand_landmarks.landmark if results.right_hand_landmarks else None,
+         21
+     )
+
+     all_points = pose + left_hand + right_hand
+     return np.array(all_points, dtype=np.float32).flatten()  # (225,)
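+
+ # Per-frame feature layout (fixed by the concatenation order above):
+ #   [0:99]    pose       33 landmarks * (x, y, z)
+ #   [99:162]  left hand  21 landmarks * (x, y, z)
+ #   [162:225] right hand 21 landmarks * (x, y, z)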
+
+
+ # =========================
+ # PROCESS VIDEO -> SEQUENCE
+ # =========================
+
+ def preprocess_video_to_sequence(video_path, max_frames=MAX_FRAMES):
+     """
+     Runs MediaPipe Holistic over a video file (.mp4, .mov, etc.) and
+     returns a sequence of shape (1, max_frames, 225) ready for the model.
+     """
+     cap = cv2.VideoCapture(video_path)
+
+     frames_feats = []
+     with mp_holistic.Holistic(
+         static_image_mode=False,
+         model_complexity=1,
+         enable_segmentation=False,
+         refine_face_landmarks=False,
+         min_detection_confidence=0.5,
+         min_tracking_confidence=0.5
+     ) as holistic:
+
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             # OpenCV decodes frames as BGR; MediaPipe expects RGB.
+             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             results = holistic.process(frame_rgb)
+
+             vec = extract_landmarks_from_results(results)
+             frames_feats.append(vec)
+
+             if len(frames_feats) >= max_frames:
+                 break
+
+     cap.release()
+
+     if len(frames_feats) == 0:
+         raise ValueError("The video has no valid frames to process.")
+
+     seq = np.array(frames_feats, dtype=np.float32)
+
+     # Pad with zeros or truncate to exactly max_frames
+     if seq.shape[0] < max_frames:
+         pad_len = max_frames - seq.shape[0]
+         pad = np.zeros((pad_len, seq.shape[1]), dtype=np.float32)
+         seq = np.concatenate([seq, pad], axis=0)
+     else:
+         seq = seq[:max_frames, :]
+
+     seq = seq.reshape(1, max_frames, seq.shape[1])  # (1, T, 225)
+     return seq
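+
+ # NOTE (assumption): only the FIRST max_frames frames of the clip are used;
+ # longer videos are truncated rather than sampled uniformly. This should
+ # mirror whatever frame-selection strategy the training pipeline used.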
+
+
+ # =========================
+ # PREDICTION FUNCTION FOR GRADIO
+ # =========================
+
+ def predict_sign(video):
+     """
+     Gradio passes 'video' as the path to the recorded or uploaded
+     temporary file (.mp4).
+     """
+     if video is None:
+         return "Upload or record a video first.", {}
+
+     try:
+         seq = preprocess_video_to_sequence(video, max_frames=MAX_FRAMES)
+
+         probs = model.predict(seq, verbose=0)[0]  # (num_classes,)
+         idx = int(np.argmax(probs))
+         label = label_names[idx]
+         # Assumes the model ends in a softmax, so probs[idx] reads as confidence.
+         conf = float(probs[idx])
+
+         # Full probability distribution for the gr.Label component:
+         probs_dict = {
+             name: float(probs[i])
+             for i, name in enumerate(label_names)
+         }
+
+         result_text = f"Predicted sign: {label} (confidence {conf:.2f})"
+         return result_text, probs_dict
+
+     except Exception as e:
+         return f"Error processing the video: {str(e)}", {}
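+
+ # Hypothetical local smoke test, bypassing the UI (path is illustrative only):
+ #   print(predict_sign("samples/example_sign.mp4"))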
+
+
+ # =========================
+ # GRADIO INTERFACE
+ # =========================
+
+ title = "LSP-EnSeñas - LSTM Demo"
+ description = """
+ Upload or record a short video of yourself performing a sign (for example,
+ one of the colors used during training). The LSTM model analyzes the sequence
+ of body and hand landmarks extracted with MediaPipe Holistic and predicts the
+ most likely class.
+ """
+
+ demo = gr.Interface(
+     fn=predict_sign,
+     inputs=gr.Video(
+         sources=["webcam", "upload"],  # Gradio 4.x API; Gradio 3.x used source="webcam"
+         label="Sign video (webcam or upload)"
+     ),
+     outputs=[
+         gr.Textbox(label="Result"),
+         gr.Label(label="Per-class probabilities")
+     ],
+     title=title,
+     description=description,
+     allow_flagging="never"
+ )
+
+ if __name__ == "__main__":
+     demo.launch()