|
|
import os |
|
|
import json |
|
|
import cv2 |
|
|
import numpy as np |
|
|
import mediapipe as mp |
|
|
import tensorflow as tf |
|
|
import gradio as gr |
|
|
|
|
|
# Log the TensorFlow version at startup to make deployment mismatches easy to spot.
print("TensorFlow version:", tf.__version__)
|
|
|
|
|
|
|
|
# Resolve all artifact paths relative to this file so the script works
# regardless of the current working directory.
BASE_DIR = os.path.dirname(__file__)
MODELS_DIR = os.path.join(BASE_DIR, "models")

# Trained LSTM model and the JSON list of class names it predicts.
MODEL_PATH = os.path.join(MODELS_DIR, "sign_model_lstm_v1.keras")
LABELS_PATH = os.path.join(MODELS_DIR, "label_names.json")

print("Cargando modelo desde:", MODEL_PATH)
model = tf.keras.models.load_model(MODEL_PATH)

# label_names: index-aligned with the model's output probabilities.
with open(LABELS_PATH, "r") as f:
    label_names = json.load(f)

# MediaPipe Holistic solution (pose + hands landmarks).
mp_holistic = mp.solutions.holistic

# Sequence length fed to the LSTM (frames per clip).
MAX_FRAMES = 20
# Features per frame: (33 pose + 21 left hand + 21 right hand) * 3 coords = 225.
N_FEATURES = 225
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_landmarks_from_results(results):
    """Flatten MediaPipe Holistic output into a single (225,) float32 vector.

    Concatenates pose (33 points), left hand (21) and right hand (21),
    each point as (x, y, z). Any body part that was not detected is
    zero-filled so the output length is always 225.
    """
    def as_rows(landmark_list, count):
        # Zero-fill undetected parts; pad/trim detected ones to `count` rows.
        if landmark_list is None:
            return [[0.0, 0.0, 0.0] for _ in range(count)]
        rows = [[p.x, p.y, p.z] for p in landmark_list]
        while len(rows) < count:
            rows.append([0.0, 0.0, 0.0])
        del rows[count:]
        return rows

    pose_rows = as_rows(
        results.pose_landmarks.landmark if results.pose_landmarks else None, 33
    )
    left_rows = as_rows(
        results.left_hand_landmarks.landmark if results.left_hand_landmarks else None, 21
    )
    right_rows = as_rows(
        results.right_hand_landmarks.landmark if results.right_hand_landmarks else None, 21
    )

    return np.asarray(pose_rows + left_rows + right_rows, dtype=np.float32).ravel()
|
|
|
|
|
|
|
|
def preprocess_video_to_sequence(video_path, max_frames=MAX_FRAMES):
    """Read a video, extract per-frame landmarks and build an LSTM input.

    Parameters
    ----------
    video_path : str
        Path of the video file to process.
    max_frames : int
        Fixed sequence length; shorter clips are zero-padded, longer
        clips are truncated to the first ``max_frames`` frames.

    Returns
    -------
    np.ndarray
        Float32 array of shape (1, max_frames, 225), ready for the model.

    Raises
    ------
    ValueError
        If ``video_path`` is None or the video yields no usable frames.
    """
    if video_path is None:
        raise ValueError("No se recibi贸 ruta de video.")

    cap = cv2.VideoCapture(video_path)
    frames_feats = []

    # FIX: the capture handle was previously not released if MediaPipe or
    # OpenCV raised inside the loop; try/finally guarantees cleanup.
    try:
        with mp_holistic.Holistic(
            static_image_mode=False,
            model_complexity=1,
            enable_segmentation=False,
            refine_face_landmarks=False,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        ) as holistic:

            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                # MediaPipe expects RGB; OpenCV decodes frames as BGR.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = holistic.process(frame_rgb)

                frames_feats.append(extract_landmarks_from_results(results))

                # Only the first max_frames frames are used — stop early.
                if len(frames_feats) >= max_frames:
                    break
    finally:
        cap.release()

    if len(frames_feats) == 0:
        raise ValueError("El video no tiene frames v谩lidos para procesar.")

    seq = np.array(frames_feats, dtype=np.float32)

    # Zero-pad short clips / truncate long ones to a fixed length.
    if seq.shape[0] < max_frames:
        pad_len = max_frames - seq.shape[0]
        pad = np.zeros((pad_len, seq.shape[1]), dtype=np.float32)
        seq = np.concatenate([seq, pad], axis=0)
    else:
        seq = seq[:max_frames, :]

    # Add the batch dimension: (1, max_frames, n_features).
    seq = seq.reshape(1, max_frames, seq.shape[1])
    return seq
|
|
|
|
|
|
|
|
def predict_video_lstm(video_path):
    """Gradio handler: classify a sign-language clip.

    Receives a video file path and returns a human-readable result string
    plus a mapping of class name -> probability for the Label component.
    """
    if video_path is None:
        return "Sube o graba un video primero.", {}

    try:
        sequence = preprocess_video_to_sequence(video_path, max_frames=MAX_FRAMES)
        probs = model.predict(sequence, verbose=0)[0]
    except Exception as e:
        # Surface preprocessing/inference failures in the UI instead of crashing.
        return f"Error procesando el video: {e}", {}

    best = int(np.argmax(probs))
    label = label_names[best]
    conf = float(probs[best])

    prob_dict = {name: float(p) for name, p in zip(label_names, probs)}

    texto = f"Predicci贸n: {label} (confianza {conf:.2f})"
    return texto, prob_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: one video input (upload or webcam) wired to the LSTM prediction
# handler, with a text result plus a per-class probability display.
demo = gr.Interface(
    fn=predict_video_lstm,
    inputs=gr.Video(
        sources=["upload", "webcam"],
        label="Video de se帽a (sube o graba)",
        format="mp4",
        # NOTE(review): `type="filepath"` may not be accepted by every Gradio
        # release (it is the implicit behavior in 4.x) — confirm against the
        # pinned Gradio version.
        type="filepath"
    ),
    outputs=[
        gr.Textbox(label="Resultado"),
        gr.Label(label="Probabilidades por clase")
    ],
    title="LSP-EnSe帽as - Demo LSTM",
    description=(
        "Sube un video corto o gr谩balo con la webcam haciendo una se帽a. "
        "El modelo LSTM procesa la secuencia de landmarks (pose + manos) y "
        "muestra la clase m谩s probable y la distribuci贸n de probabilidades."
    ),
)
|
|
|
|
|
# Launch the Gradio app only when executed as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
|
|
|