# app.py
#
# Real-time hand-gesture classifier: MediaPipe Hands landmarks -> LSTM.
# Assumes Gradio 3.x (gr.Image(source="webcam") and Image.stream() were
# reworked in Gradio 4), TensorFlow 2.x, and a scikit-learn LabelEncoder
# saved with joblib.

import gradio as gr
import mediapipe as mp
import numpy as np
import joblib
import tensorflow as tf
from collections import deque
import cv2

SEQ_LEN = 30
MODEL_PATH = "gesture_lstm.h5"
LABELS_PATH = "labels.joblib"

mp_hands = mp.solutions.hands

# Load model and label encoder.
model = tf.keras.models.load_model(MODEL_PATH)
le = joblib.load(LABELS_PATH)

# Buffer holding the most recent SEQ_LEN landmark vectors (global, per app instance).
buffer = deque(maxlen=SEQ_LEN)


def extract_landmarks_from_image(img):
    """One-off landmark extraction; unused by the streaming path below, kept for reference.

    img is RGB (the Gradio webcam returns RGB). A fresh Hands object is created
    per call, so static_image_mode=True is the appropriate setting here.
    """
    with mp_hands.Hands(static_image_mode=True, max_num_hands=1,
                        min_detection_confidence=0.5) as hands:
        res = hands.process(img)
    if res.multi_hand_landmarks:
        lm = res.multi_hand_landmarks[0]
        vec = [c for p in lm.landmark for c in (p.x, p.y, p.z)]
        return np.array(vec, dtype=np.float32), res
    return np.zeros(21 * 3, dtype=np.float32), res


# Keep a persistent MediaPipe Hands object across calls for speed: tracking mode
# (static_image_mode=False) avoids re-running detection on every frame.
mp_hands_persistent = mp_hands.Hands(static_image_mode=False, max_num_hands=1,
                                     min_detection_confidence=0.5,
                                     min_tracking_confidence=0.5)


def predict_frame(frame):
    """Process one webcam frame; return the annotated image and a label string.

    frame: RGB numpy array from the Gradio webcam (may be None before the
    camera starts streaming).
    """
    if frame is None:
        return None, "Waiting for webcam..."

    image = frame.copy()
    # MediaPipe expects RGB; Gradio already delivers RGB.
    res = mp_hands_persistent.process(image)

    if res.multi_hand_landmarks:
        lm = res.multi_hand_landmarks[0]
        # Flatten the 21 landmarks into a 63-dim (x, y, z) vector.
        vec = np.array([c for p in lm.landmark for c in (p.x, p.y, p.z)],
                       dtype=np.float32)
        # Draw landmarks on the image (convert to BGR for cv2-style drawing, then back).
        img_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        mp.solutions.drawing_utils.draw_landmarks(img_bgr, lm, mp_hands.HAND_CONNECTIONS)
        image = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    else:
        # No hand detected: push a zero vector so the sequence stays aligned in time.
        vec = np.zeros(21 * 3, dtype=np.float32)

    buffer.append(vec)

    label_text = "No prediction (buffering...)"
    if len(buffer) == SEQ_LEN:
        seq = np.stack(buffer, axis=0)  # (seq_len, features)
        # Per-sequence normalization; this must match what the model saw at training time.
        mean = seq.mean(axis=0)
        std = seq.std(axis=0) + 1e-8
        seq = (seq - mean) / std
        seq = np.expand_dims(seq, axis=0)  # (1, seq_len, features)
        probs = model.predict(seq, verbose=0)[0]
        idx = int(np.argmax(probs))
        label = le.inverse_transform([idx])[0]
        confidence = float(probs[idx])
        label_text = f"{label} ({confidence * 100:.1f}%)"

    return image, label_text


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown(
        "## Air Hacking / Security Gesture Simulator\n"
        "Point your webcam at your hand and perform a stored gesture. "
        "The model predicts after it has collected enough frames."
    )
    with gr.Row():
        webcam = gr.Image(source="webcam", streaming=True, type="numpy", label="Webcam")
        output_img = gr.Image(label="Annotated feed")
        output_label = gr.Textbox(label="Prediction")

    # Stream each webcam frame through the classifier. The stream event needs
    # inputs=webcam so the frame is actually passed to the function; writing
    # back into the streaming input component would fight the live feed, so
    # the annotated frame goes to a separate output image. Frame rate is
    # driven by the browser's stream, so no polling interval is needed.
    webcam.stream(fn=predict_frame, inputs=webcam, outputs=[output_img, output_label])

    gr.Markdown(
        "**Security notes:** Use this demo for demonstration only. "
        "See the README for production hardening tips."
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)