# app.py
import gradio as gr
import mediapipe as mp
import numpy as np
import joblib
import tensorflow as tf
from collections import deque
import cv2
SEQ_LEN = 30
MODEL_PATH = "gesture_lstm.h5"
LABELS_PATH = "labels.joblib"
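# Feature layout (derived from the extraction code below): each frame yields
# 21 hand landmarks x (x, y, z) = 63 values, so the LSTM consumes batches of
# shape (1, SEQ_LEN, 63). The checkpoint at MODEL_PATH must match this layout.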
mp_hands = mp.solutions.hands
# load model and labels
model = tf.keras.models.load_model(MODEL_PATH)
le = joblib.load(LABELS_PATH)
# buffer to hold sequence of vectors (global, per app instance)
buffer = deque(maxlen=SEQ_LEN)
def extract_landmarks_from_image(img):
    # img is RGB (the gradio webcam returns RGB)
    # static_image_mode=True: each call builds a fresh Hands instance, so there
    # is no tracking state to carry between frames (the streaming path below
    # uses a persistent tracker instead)
    with mp_hands.Hands(static_image_mode=True,
                        max_num_hands=1,
                        min_detection_confidence=0.5) as hands:
        res = hands.process(img)
    if res.multi_hand_landmarks:
        lm = res.multi_hand_landmarks[0]
        vec = []
        for p in lm.landmark:
            vec.extend([p.x, p.y, p.z])
        return np.array(vec, dtype=np.float32), res
    else:
        return np.zeros(21 * 3, dtype=np.float32), res
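# Hypothetical offline helper (not called by the live demo): turn a list of
# RGB frames into one (len(frames), 63) training-style sequence using the
# single-image extractor above. `frames` is assumed to be loaded elsewhere.
def sequence_from_frames(frames):
    vecs = [extract_landmarks_from_image(f)[0] for f in frames]
    return np.stack(vecs, axis=0)  # (len(frames), 63)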
# We will keep a persistent mediapipe Hands object across calls for speed:
mp_hands_persistent = mp_hands.Hands(static_image_mode=False,
                                     max_num_hands=1,
                                     min_detection_confidence=0.5,
                                     min_tracking_confidence=0.5)
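# Optional cleanup (assumes a long-lived single-process deployment): mediapipe
# solution objects expose close(), so register it to release the persistent
# graph when the interpreter exits.
import atexit
atexit.register(mp_hands_persistent.close)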
def predict_frame(frame):
    # frame: RGB numpy array from the gradio webcam
    # returns the image with a landmark overlay plus "label (confidence)" text
    global buffer
    if frame is None:  # gradio can deliver empty frames when the stream starts
        return None, "No frame"
    image = frame.copy()
    # MediaPipe expects RGB; gradio already gives RGB
    res = mp_hands_persistent.process(image)
    if res.multi_hand_landmarks:
        lm = res.multi_hand_landmarks[0]
        vec = []
        for p in lm.landmark:
            vec.extend([p.x, p.y, p.z])
        vec = np.array(vec, dtype=np.float32)
        # draw landmarks on the image (convert to BGR for cv2 drawing)
        img_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        mp.solutions.drawing_utils.draw_landmarks(img_bgr, lm, mp_hands.HAND_CONNECTIONS)
        image = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    else:
        # no hand in frame: feed a zero vector so the sequence stays contiguous
        vec = np.zeros(21 * 3, dtype=np.float32)
    buffer.append(vec)
    label_text = "No prediction (buffering...)"
    confidence = 0.0
    if len(buffer) == SEQ_LEN:
        seq = np.stack(buffer, axis=0)  # (seq_len, features)
        # per-sample z-normalization; this must mirror whatever
        # normalization the model saw during training
        mean = seq.mean(axis=0)
        std = seq.std(axis=0) + 1e-8
        seq = (seq - mean) / std
        seq = np.expand_dims(seq, axis=0)  # (1, seq_len, features)
        probs = model.predict(seq, verbose=0)[0]
        idx = np.argmax(probs)
        label = le.inverse_transform([idx])[0]
        confidence = float(probs[idx])
        label_text = f"{label} ({confidence * 100:.1f}%)"
    return image, label_text
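def _smoke_test():
    # Illustrative sanity check, not wired into the demo: push SEQ_LEN black
    # frames through predict_frame to confirm the buffer fills and the model
    # runs end to end. The label is meaningless on blank input; this only
    # validates shapes and plumbing. Note that it mutates the global buffer.
    dummy = np.zeros((480, 640, 3), dtype=np.uint8)
    label = "no prediction"
    for _ in range(SEQ_LEN):
        _, label = predict_frame(dummy)
    print("smoke test:", label)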
# Gradio UI
# (gr.Image(source=..., tool=...) and .stream(..., every=...) follow the
# Gradio 3.x API; Gradio 4.x renamed `source` to `sources` and removed `tool`.)
with gr.Blocks() as demo:
    gr.Markdown("## Air Hacking / Security Gesture Simulator\n"
                "Point your webcam and perform a stored gesture. "
                "The model predicts after it has collected enough frames.")
    with gr.Row():
        webcam = gr.Image(source="webcam", streaming=True, tool="none", type="numpy")
        output_label = gr.Textbox(label="Prediction")

    def process_frame(frame):
        img, label = predict_frame(frame)
        return img, label

    # inputs=[webcam] is required so the current frame is passed to the handler
    webcam.stream(fn=process_frame, inputs=[webcam],
                  outputs=[webcam, output_label], every=0.06)
    gr.Markdown("**Security notes:** Use this demo for demonstration only. "
                "See README for production hardening tips.")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)