Spaces:

fabiosam
/

EnSenas

Sleeping

App Files Files Community

fabiosam committed on Nov 17, 2025

Commit

f9d7cbe

verified ·

1 Parent(s): 56da706

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -107

app.py CHANGED Viewed

@@ -1,77 +1,51 @@
-import os
-import json
-import cv2
 import numpy as np
 import mediapipe as mp
-import tensorflow as tf
-import gradio as gr
print("TensorFlow version:", tf.__version__)

# ==== MODEL PATHS ====
# All assets are resolved relative to the directory containing this file.
BASE_DIR = os.path.dirname(__file__)
MODELS_DIR = os.path.join(BASE_DIR, "models")
MODEL_PATH = os.path.join(MODELS_DIR, "sign_model_lstm_v1.keras")
LABELS_PATH = os.path.join(MODELS_DIR, "label_names.json")

print("Cargando modelo desde:", MODEL_PATH)
model = tf.keras.models.load_model(MODEL_PATH)

# Decode the labels file explicitly as UTF-8: without `encoding`, open() falls
# back to the platform locale, which can corrupt non-ASCII label names.
with open(LABELS_PATH, "r", encoding="utf-8") as f:
    label_names = json.load(f)

mp_holistic = mp.solutions.holistic

MAX_FRAMES = 20   # same value that was used for training
N_FEATURES = 225  # 75 points * 3 coords (x, y, z)
-# ========= FUNCIONES DE PROCESADO =========
 def extract_landmarks_from_results(results):
-    """
-    Convierte los resultados de MediaPipe Holistic en un vector 1D (225,)
-    con pose (33), mano izq (21) y mano der (21).
-    """
-    def get_xyz(landmarks, n_points):
         if landmarks is None:
-            data = [[0.0, 0.0, 0.0]] * n_points
         else:
             data = [[lm.x, lm.y, lm.z] for lm in landmarks]
-            if len(data) < n_points:
-                data += [[0.0, 0.0, 0.0]] * (n_points - len(data))
-            data = data[:n_points]
         return data
-    pose = get_xyz(
-        results.pose_landmarks.landmark if results.pose_landmarks else None,
-        33
-    )
-    left_hand = get_xyz(
-        results.left_hand_landmarks.landmark if results.left_hand_landmarks else None,
-        21
-    )
-    right_hand = get_xyz(
-        results.right_hand_landmarks.landmark if results.right_hand_landmarks else None,
-        21
-    )
-    all_points = pose + left_hand + right_hand
-    return np.array(all_points, dtype=np.float32).flatten()  # (225,)
-def preprocess_video_to_sequence(video_path, max_frames=MAX_FRAMES):
-    """
-    Lee un video, extrae landmarks por frame y devuelve
-    una secuencia (1, max_frames, 225) lista para el LSTM.
-    """
-    if video_path is None:
-        raise ValueError("No se recibió ruta de video.")
     cap = cv2.VideoCapture(video_path)
-    frames_feats = []
     with mp_holistic.Holistic(
         static_image_mode=False,
         model_complexity=1,
@@ -86,80 +60,54 @@ def preprocess_video_to_sequence(video_path, max_frames=MAX_FRAMES):
             if not ret:
                 break
-            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            results = holistic.process(frame_rgb)
-            vec = extract_landmarks_from_results(results)  # (225,)
-            frames_feats.append(vec)
-            if len(frames_feats) >= max_frames:
-                break
     cap.release()
-    if len(frames_feats) == 0:
-        raise ValueError("El video no tiene frames válidos para procesar.")
-    seq = np.array(frames_feats, dtype=np.float32)
-    # padding / recorte
-    if seq.shape[0] < max_frames:
-        pad_len = max_frames - seq.shape[0]
-        pad = np.zeros((pad_len, seq.shape[1]), dtype=np.float32)
-        seq = np.concatenate([seq, pad], axis=0)
     else:
-        seq = seq[:max_frames, :]
-    seq = seq.reshape(1, max_frames, seq.shape[1])  # (1, T, 225)
-    return seq
def predict_video_lstm(video_path):
    """Gradio callback: classify the sign shown in a video.

    Args:
        video_path: Path of the video to classify, or ``None`` when no
            video was provided.

    Returns:
        A ``(text, probabilities)`` tuple. ``text`` is the message to show;
        ``probabilities`` maps every label name to its float probability.
        When no video was given, or preprocessing/inference raises, the
        text explains the problem and the dict is empty.
    """
    if video_path is None:
        return "Sube o graba un video primero.", {}

    try:
        sequence = preprocess_video_to_sequence(video_path, max_frames=MAX_FRAMES)
        scores = model.predict(sequence, verbose=0)[0]  # (num_classes,)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error procesando el video: {e}", {}
    else:
        idx = int(np.argmax(scores))
        label = label_names[idx]
        conf = float(scores[idx])
        per_class = {
            name: float(scores[position])
            for position, name in enumerate(label_names)
        }
        return f"Predicción: {label} (confianza {conf:.2f})", per_class
# ========= GRADIO INTERFACE =========
# Components are created in the same order as they appear in the interface:
# video input first, then the two outputs.
_video_input = gr.Video(
    sources=["upload", "webcam"],   # note: the keyword is `sources`, not `source`
    label="Video de seña (sube o graba)",
    format="mp4",
    type="filepath",                # Gradio passes the file path to the function
)
_outputs = [
    gr.Textbox(label="Resultado"),
    gr.Label(label="Probabilidades por clase"),
]
_description = (
    "Sube un video corto o grábalo con la webcam haciendo una seña. "
    "El modelo LSTM procesa la secuencia de landmarks (pose + manos) y "
    "muestra la clase más probable y la distribución de probabilidades."
)

demo = gr.Interface(
    fn=predict_video_lstm,
    inputs=_video_input,
    outputs=_outputs,
    title="LSP-EnSeñas - Demo LSTM",
    description=_description,
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()

+import gradio as gr
 import numpy as np
 import mediapipe as mp
+import cv2
+from tensorflow import keras
+import json
+import os
# === LOAD MODEL & LABELS ===
# Anchor asset paths to this file's directory so loading does not depend on
# the working directory the process happens to be started from.
_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(_BASE_DIR, "models", "sign_model_lstm_v1.keras")
LABELS_PATH = os.path.join(_BASE_DIR, "models", "label_names.json")

# `keras.__version__` is the Keras version, so label it as such.
print(f"Keras version: {keras.__version__}")
print("Cargando modelo desde:", MODEL_PATH)
model = keras.models.load_model(MODEL_PATH)

# Decode the labels file explicitly as UTF-8: without `encoding`, open() falls
# back to the platform locale, which can corrupt non-ASCII label names.
with open(LABELS_PATH, "r", encoding="utf-8") as f:
    label_names = json.load(f)

# Features per frame: pose (33) + left hand (21) + right hand (21) landmarks,
# each with x, y, z values.
NUM_POINTS = (33 + 21 + 21) * 3
mp_holistic = mp.solutions.holistic
 def extract_landmarks_from_results(results):
+    def get_xyz(landmarks, n):
         if landmarks is None:
+            data = [[0,0,0]] * n
         else:
             data = [[lm.x, lm.y, lm.z] for lm in landmarks]
+            if len(data) < n:
+                data += [[0,0,0]] * (n - len(data))
+            data = data[:n]
         return data
+    pose = get_xyz(results.pose_landmarks.landmark if results.pose_landmarks else None, 33)
+    left = get_xyz(results.left_hand_landmarks.landmark if results.left_hand_landmarks else None, 21)
+    right = get_xyz(results.right_hand_landmarks.landmark if results.right_hand_landmarks else None, 21)
+    all_points = pose + left + right
+    return np.array(all_points, dtype=np.float32).flatten()
+def predict_video_lstm(video_path):
+    print("Procesando video:", video_path)
     cap = cv2.VideoCapture(video_path)
+    frames_landmarks = []
     with mp_holistic.Holistic(
         static_image_mode=False,
         model_complexity=1,
             if not ret:
                 break
+            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            results = holistic.process(image_rgb)
+            vec = extract_landmarks_from_results(results)
+            frames_landmarks.append(vec)
     cap.release()
+    if len(frames_landmarks) == 0:
+        return "No se encontraron landmarks", {}
+    # Pad / slice to fixed length (20 frames)
+    MAX_FRAMES = 20
+    if len(frames_landmarks) < MAX_FRAMES:
+        pad = [np.zeros(NUM_POINTS)] * (MAX_FRAMES - len(frames_landmarks))
+        frames_landmarks = frames_landmarks + pad
     else:
+        frames_landmarks = frames_landmarks[:MAX_FRAMES]
+    X = np.array(frames_landmarks, dtype=np.float32).reshape(1, MAX_FRAMES, NUM_POINTS)
+    probs = model.predict(X, verbose=0)[0]
     idx = int(np.argmax(probs))
+    prediction = label_names[idx]
+    confidences = {label_names[i]: float(probs[i]) for i in range(len(probs))}
+    return f"Predicción: {prediction}", confidences
# === GRADIO UI ===
# Single-function interface: one video input (uploaded or recorded), and two
# outputs — a text result and a per-class confidence label.
demo = gr.Interface(
    fn=predict_video_lstm,
    inputs=gr.Video(
        sources=["upload", "webcam"],
        label="Sube un video o graba tu seña",
        format="mp4",
    ),
    outputs=[
        gr.Textbox(label="Resultado"),
        gr.Label(label="Confianza por clase"),
    ],
    title="LSP-EnSeñas - Demo LSTM",
    description=(
        "Traductor de señas basado en LSTM + MediaPipe Holistic.\n"
        "Sube un video corto o grábalo en vivo haciendo una seña.\n"
        "El modelo procesará el movimiento (cuerpo + manos) y mostrará la predicción."
    ),
)

# Launch only when executed as a script, so importing this module (tests,
# tooling) does not start a web server as a side effect.
if __name__ == "__main__":
    demo.launch()