Update app.py
app.py CHANGED
@@ -1,7 +1,5 @@
 import os
 import json
-import io
-
 import cv2
 import numpy as np
 import mediapipe as mp
@@ -10,7 +8,6 @@ from tensorflow import keras
 
 import gradio as gr
 import matplotlib.pyplot as plt
-from PIL import Image
 
 print("TensorFlow version:", tf.__version__)
 
@@ -30,13 +27,13 @@ with open(LABELS_PATH, "r") as f:
 num_classes = len(label_names)
 MAX_FRAMES = 20  # same value used during training
 
+
 # ==== MEDIAPIPE ====
 mp_holistic = mp.solutions.holistic
 mp_drawing = mp.solutions.drawing_utils
 mp_styles = mp.solutions.drawing_styles
 
 
-# ---- 1. EXTRACT LANDMARKS AS A VECTOR (SAME AS IN TRAINING) ----
 def extract_landmarks_from_results(results):
     """
     Converts the MediaPipe Holistic results into a 1D vector (225,)
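Note: the body of extract_landmarks_from_results falls outside the hunk context above. A minimal sketch of the (225,) contract its docstring implies, assuming 33 pose + 21 left-hand + 21 right-hand landmarks with (x, y, z) each, zero-filled when a part is missing; the function name and zero-fill policy here are illustrative, not the file's actual body:

    import numpy as np

    def extract_landmarks_sketch(results):
        # 33 pose + 21 + 21 hand landmarks, 3 coords each -> 225 values
        all_points = []
        for lm_set, n in [(results.pose_landmarks, 33),
                          (results.left_hand_landmarks, 21),
                          (results.right_hand_landmarks, 21)]:
            if lm_set is not None:
                all_points.extend([lm.x, lm.y, lm.z] for lm in lm_set.landmark)
            else:
                all_points.extend([0.0, 0.0, 0.0] for _ in range(n))
        return np.array(all_points, dtype=np.float32).flatten()  # (225,)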
@@ -61,19 +58,30 @@ def extract_landmarks_from_results(results):
     return np.array(all_points, dtype=np.float32).flatten()  # (225,)
 
 
-
-def video_to_sequence_and_landmarks_frame(video_path, max_frames=MAX_FRAMES):
+def video_to_sequence_and_annotated(video_path, max_frames=MAX_FRAMES):
     """
     Processes a video:
     - Returns the sequence (1, max_frames, 225) for the LSTM
-    - Returns
-      on the first frame where something is detected.
+    - Returns the path of a new video with the landmarks drawn on it.
     """
     cap = cv2.VideoCapture(video_path)
 
+    if not cap.isOpened():
+        raise ValueError(f"No se pudo abrir el video: {video_path}")
+
     frames_feats = []
-
-
+
+    # Input video info
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    if fps is None or fps <= 0:
+        fps = 25.0
+
+    # Temporary path for the annotated video
+    out_path = os.path.join("/tmp", "annotated_output.mp4")
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
 
     with mp_holistic.Holistic(
         static_image_mode=False,
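One caveat alongside this hunk: cv2.VideoWriter fails silently when the mp4v codec is unavailable, in which case write() becomes a no-op and the output file stays empty. A guard in the same spirit as the cap.isOpened() check above (an assumption, not part of this commit) could be:

    # Hypothetical extra guard (not in this commit): writer.isOpened() is
    # False when the codec could not be initialized.
    if not writer.isOpened():
        raise ValueError(f"No se pudo crear el video anotado: {out_path}")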
@@ -84,6 +92,7 @@ def video_to_sequence_and_landmarks_frame(video_path, max_frames=MAX_FRAMES):
         min_tracking_confidence=0.5
     ) as holistic:
 
+        frame_idx = 0
         while True:
             ret, frame = cap.read()
             if not ret:
@@ -92,29 +101,48 @@ def video_to_sequence_and_landmarks_frame(video_path, max_frames=MAX_FRAMES):
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             results = holistic.process(frame_rgb)
 
-
-            frames_feats
-
-
-
-
-
-
-
-
-
-
-
-
+            # Features for the model (we only keep up to max_frames)
+            if len(frames_feats) < max_frames:
+                vec = extract_landmarks_from_results(results)
+                frames_feats.append(vec)
+
+            # DRAW THE LANDMARKS ON THE FRAME
+            annotated = frame.copy()
+
+            if results.pose_landmarks:
+                mp_drawing.draw_landmarks(
+                    annotated,
+                    results.pose_landmarks,
+                    mp_holistic.POSE_CONNECTIONS,
+                    landmark_drawing_spec=mp_styles.get_default_pose_landmarks_style()
+                )
+            if results.left_hand_landmarks:
+                mp_drawing.draw_landmarks(
+                    annotated,
+                    results.left_hand_landmarks,
+                    mp_holistic.HAND_CONNECTIONS,
+                    landmark_drawing_spec=mp_styles.get_default_hand_landmarks_style()
+                )
+            if results.right_hand_landmarks:
+                mp_drawing.draw_landmarks(
+                    annotated,
+                    results.right_hand_landmarks,
+                    mp_holistic.HAND_CONNECTIONS,
+                    landmark_drawing_spec=mp_styles.get_default_hand_landmarks_style()
+                )
+
+            writer.write(annotated)
+            frame_idx += 1
 
     cap.release()
+    writer.release()
 
     if len(frames_feats) == 0:
         raise ValueError("No se pudieron leer frames válidos del video.")
 
     seq = np.array(frames_feats, dtype=np.float32)
 
-    # Padding / trimming
+    # Padding / trimming for the LSTM
     if seq.shape[0] < max_frames:
         pad_len = max_frames - seq.shape[0]
         pad = np.zeros((pad_len, seq.shape[1]), dtype=np.float32)
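The padding branch continues into the next hunk. As a concrete example of the shape contract: a 12-frame clip yields seq of shape (12, 225), gains 8 zero rows, and is reshaped to (1, 20, 225) for the LSTM. A sketch with MAX_FRAMES = 20 (the concatenation step sits between hunks, so np.concatenate here is an assumption):

    import numpy as np

    seq = np.random.rand(12, 225).astype(np.float32)   # features from a 12-frame clip
    pad = np.zeros((20 - 12, 225), dtype=np.float32)   # 8 zero rows of padding
    seq = np.concatenate([seq, pad], axis=0)           # (20, 225)
    seq = seq.reshape(1, 20, 225)                      # (1, T, 225), batch of one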
@@ -124,63 +152,35 @@ def video_to_sequence_and_landmarks_frame(video_path, max_frames=MAX_FRAMES):
 
     seq = seq.reshape(1, max_frames, seq.shape[1])  # (1, T, 225)
 
-
-
-
-    annotated = frame_for_vis.copy()
-
-    mp_drawing.draw_landmarks(
-        annotated,
-        results_for_vis.pose_landmarks,
-        mp_holistic.POSE_CONNECTIONS,
-        landmark_drawing_spec=mp_styles.get_default_pose_landmarks_style()
-    )
-    mp_drawing.draw_landmarks(
-        annotated,
-        results_for_vis.left_hand_landmarks,
-        mp_holistic.HAND_CONNECTIONS,
-        landmark_drawing_spec=mp_styles.get_default_hand_landmarks_style()
-    )
-    mp_drawing.draw_landmarks(
-        annotated,
-        results_for_vis.right_hand_landmarks,
-        mp_holistic.HAND_CONNECTIONS,
-        landmark_drawing_spec=mp_styles.get_default_hand_landmarks_style()
-    )
-
-    annotated_rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)
-    landmarks_image = Image.fromarray(annotated_rgb)
-
-    return seq, landmarks_image
-
-
-# ---- 3. PREDICTION + CHART ----
+    return seq, out_path
+
+
 def predict_from_video(video):
     """
-    Function
-    -
-    -
+    Function called by Gradio.
+    - Receives the video path (upload or webcam).
+    - Returns: text, bar chart, video with the landmarks drawn.
     """
     if video is None:
         return "Sube un video o grábalo primero.", None, None
 
-    # Gradio passes
+    # Gradio sometimes passes a dict with 'video'
    if isinstance(video, dict) and "video" in video:
         video_path = video["video"]
     else:
         video_path = video
 
-    seq, landmarks_image = video_to_sequence_and_landmarks_frame(video_path, MAX_FRAMES)
+    seq, annotated_path = video_to_sequence_and_annotated(video_path, MAX_FRAMES)
 
     probs = model.predict(seq, verbose=0)[0]  # (num_classes,)
     idx = int(np.argmax(probs))
     label = label_names[idx]
     conf = float(probs[idx])
 
     # Text
     text_pred = f"Predicción: {label} (confianza {conf:.2f})"
 
     # Bar chart
     fig, ax = plt.subplots(figsize=(6, 3))
     ax.bar(range(len(label_names)), probs)
     ax.set_xticks(range(len(label_names)))
@@ -190,16 +190,16 @@ def predict_from_video(video):
     ax.set_title("Confianza por clase")
     plt.tight_layout()
 
-    return text_pred, fig, landmarks_image
+    # annotated_path is the path of the video with the landmarks drawn
+    return text_pred, fig, annotated_path
 
 
-# ---- 4. GRADIO INTERFACE ----
 title = "LSP-EnSeñas - Demo LSTM"
 description = (
     "Traductor de señas basado en LSTM + MediaPipe Holistic. "
     "Sube un video corto o grábalo en vivo haciendo una seña. "
-    "El modelo procesará el movimiento (cuerpo + manos)
-    "
+    "El modelo procesará el movimiento (cuerpo + manos), mostrará la predicción "
+    "y devolverá tu video con los puntos (landmarks) dibujados."
 )
 
 demo = gr.Interface(
@@ -208,7 +208,7 @@ demo = gr.Interface(
     outputs=[
         gr.Textbox(label="Predicción del modelo"),
         gr.Plot(label="Confianza por clase"),
-        gr.
+        gr.Video(label="Video con landmarks detectados"),
     ],
     title=title,
     description=description,
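The diff's context ends before the gr.Interface( call is closed. A minimal local smoke test of the new pipeline, assuming app.py exposes predict_from_video and demo as shown above; the sample clip path is hypothetical:

    from app import predict_from_video, demo

    # Hypothetical sample clip; any short sign video works.
    text_pred, fig, annotated_path = predict_from_video("samples/hola.mp4")
    print(text_pred)        # "Predicción: <label> (confianza <p>)"
    print(annotated_path)   # /tmp/annotated_output.mp4

    demo.launch()  # serve the Gradio UI locally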