# app.py
#
# Real-time hand-gesture classifier: MediaPipe Hands landmarks -> LSTM.
# Assumes Gradio 3.x (gr.Image(source="webcam") and Image.stream() were
# reworked in Gradio 4), TensorFlow 2.x, and a scikit-learn LabelEncoder
# saved with joblib.

import gradio as gr
import mediapipe as mp
import numpy as np
import joblib
import tensorflow as tf
from collections import deque
import cv2

SEQ_LEN = 30
MODEL_PATH = "gesture_lstm.h5"
LABELS_PATH = "labels.joblib"

mp_hands = mp.solutions.hands

# Load model and label encoder.
model = tf.keras.models.load_model(MODEL_PATH)
le = joblib.load(LABELS_PATH)

# Buffer holding the most recent SEQ_LEN landmark vectors (global, per app instance).
buffer = deque(maxlen=SEQ_LEN)


def extract_landmarks_from_image(img):
    """One-off landmark extraction; unused by the streaming path below, kept for reference.

    img is RGB (the Gradio webcam returns RGB). A fresh Hands object is created
    per call, so static_image_mode=True is the appropriate setting here.
    """
    with mp_hands.Hands(static_image_mode=True, max_num_hands=1,
                        min_detection_confidence=0.5) as hands:
        res = hands.process(img)
    if res.multi_hand_landmarks:
        lm = res.multi_hand_landmarks[0]
        vec = [c for p in lm.landmark for c in (p.x, p.y, p.z)]
        return np.array(vec, dtype=np.float32), res
    return np.zeros(21 * 3, dtype=np.float32), res


# Keep a persistent MediaPipe Hands object across calls for speed: tracking mode
# (static_image_mode=False) avoids re-running detection on every frame.
mp_hands_persistent = mp_hands.Hands(static_image_mode=False, max_num_hands=1,
                                     min_detection_confidence=0.5,
                                     min_tracking_confidence=0.5)


def predict_frame(frame):
    """Process one webcam frame; return the annotated image and a label string.

    frame: RGB numpy array from the Gradio webcam (may be None before the
    camera starts streaming).
    """
    if frame is None:
        return None, "Waiting for webcam..."

    image = frame.copy()
    # MediaPipe expects RGB; Gradio already delivers RGB.
    res = mp_hands_persistent.process(image)

    if res.multi_hand_landmarks:
        lm = res.multi_hand_landmarks[0]
        # Flatten the 21 landmarks into a 63-dim (x, y, z) vector.
        vec = np.array([c for p in lm.landmark for c in (p.x, p.y, p.z)],
                       dtype=np.float32)
        # Draw landmarks on the image (convert to BGR for cv2-style drawing, then back).
        img_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        mp.solutions.drawing_utils.draw_landmarks(img_bgr, lm, mp_hands.HAND_CONNECTIONS)
        image = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    else:
        # No hand detected: push a zero vector so the sequence stays aligned in time.
        vec = np.zeros(21 * 3, dtype=np.float32)

    buffer.append(vec)

    label_text = "No prediction (buffering...)"
    if len(buffer) == SEQ_LEN:
        seq = np.stack(buffer, axis=0)  # (seq_len, features)
        # Per-sequence normalization; this must match what the model saw at training time.
        mean = seq.mean(axis=0)
        std = seq.std(axis=0) + 1e-8
        seq = (seq - mean) / std
        seq = np.expand_dims(seq, axis=0)  # (1, seq_len, features)
        probs = model.predict(seq, verbose=0)[0]
        idx = int(np.argmax(probs))
        label = le.inverse_transform([idx])[0]
        confidence = float(probs[idx])
        label_text = f"{label} ({confidence * 100:.1f}%)"

    return image, label_text


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown(
        "## Air Hacking / Security Gesture Simulator\n"
        "Point your webcam at your hand and perform a stored gesture. "
        "The model predicts after it has collected enough frames."
    )
    with gr.Row():
        webcam = gr.Image(source="webcam", streaming=True, type="numpy", label="Webcam")
        output_img = gr.Image(label="Annotated feed")
        output_label = gr.Textbox(label="Prediction")

    # Stream each webcam frame through the classifier. The stream event needs
    # inputs=webcam so the frame is actually passed to the function; writing
    # back into the streaming input component would fight the live feed, so
    # the annotated frame goes to a separate output image. Frame rate is
    # driven by the browser's stream, so no polling interval is needed.
    webcam.stream(fn=predict_frame, inputs=webcam, outputs=[output_img, output_label])

    gr.Markdown(
        "**Security notes:** Use this demo for demonstration only. "
        "See the README for production hardening tips."
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)