aziraarshad committed on
Commit
c2fa122
·
verified ·
1 Parent(s): b277055

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -84
app.py CHANGED
@@ -193,95 +193,101 @@ holistic = mp_holistic.Holistic(
193
  # ----------------------------
194
  # Gradio inference with state
195
  # ----------------------------
196
- def run(frame, sequence_state):
197
- """
198
- frame: numpy array from webcam (RGB)
199
- sequence_state: list of last keypoint vectors
200
- returns: annotated_frame (RGB), label dict, updated sequence_state
201
- """
202
- if frame is None:
203
- return None, {"(waiting for camera)": 1.0}, (sequence_state or [])
204
-
205
- if sequence_state is None:
206
- sequence_state = []
207
-
208
- frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
209
-
210
- image_bgr, results = mediapipe_detection(frame_bgr, holistic)
211
 
212
- # draw thicker landmarks (we'll modify this next)
213
- draw_styled_landmarks(image_bgr, results)
214
-
215
- keypoints = extract_keypoints(results)
216
- sequence_state.append(keypoints)
217
- sequence_state = sequence_state[-SEQ_LEN:]
218
-
219
- probs_dict = {}
220
- pred_text = "Waiting..."
221
 
222
- hands_present = (results.left_hand_landmarks is not None) or (results.right_hand_landmarks is not None)
223
-
224
- if not hands_present:
225
- pred_text = "No hands detected"
226
- elif len(sequence_state) == SEQ_LEN:
227
- x = torch.tensor(np.expand_dims(sequence_state, axis=0), dtype=torch.float32)
228
- with torch.no_grad():
229
- logits = model(x)
230
- probs = torch.softmax(logits, dim=1)[0].cpu().numpy()
231
 
232
- top_idx = int(np.argmax(probs))
233
- conf = float(probs[top_idx])
234
- pred_text = f"{LABELS[top_idx]} ({conf:.2%})"
 
 
 
 
 
 
 
235
 
236
- # ✅ only show top-1 in probs panel (as you wanted)
237
- probs_dict = {LABELS[top_idx]: conf}
 
 
 
 
 
 
 
238
 
239
- cv2.rectangle(image_bgr, (0, 0), (640, 45), (0, 0, 0), -1)
240
- cv2.putText(image_bgr, pred_text, (10, 30),
241
- cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2, cv2.LINE_AA)
 
 
 
 
 
242
 
243
- out_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
244
 
245
- if not probs_dict:
246
- probs_dict = {"(warming up)": 1.0}
 
 
 
 
 
 
 
247
 
248
- return out_rgb, probs_dict, sequence_state
 
249
 
 
250
 
251
- # Gradio gives RGB; MediaPipe helper expects BGR for cv2 conversions
252
  frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
253
 
 
 
 
254
  image_bgr, results = mediapipe_detection(frame_bgr, holistic)
 
 
255
  draw_styled_landmarks(image_bgr, results)
256
 
 
257
  keypoints = extract_keypoints(results)
258
- sequence_state.append(keypoints)
259
- sequence_state = sequence_state[-SEQ_LEN:]
260
-
261
- probs_dict = {}
262
- pred_text = "Waiting..."
263
- conf = 0.0
264
 
265
  hands_present = (results.left_hand_landmarks is not None) or (results.right_hand_landmarks is not None)
266
 
267
  if not hands_present:
268
- pred_text = "No hands detected"
269
- elif len(sequence_state) == SEQ_LEN:
270
- x = torch.tensor(np.expand_dims(sequence_state, axis=0), dtype=torch.float32) # (1, 30, 258)
271
- with torch.no_grad():
272
- logits = model(x)
273
- probs = torch.softmax(logits, dim=1)[0].cpu().numpy()
274
-
275
- top_idx = int(np.argmax(probs))
276
- conf = float(probs[top_idx])
277
- pred_text = f"{LABELS[top_idx]} ({conf:.2%})"
278
- probs_dict = {LABELS[i]: float(probs[i]) for i in range(NUM_CLASSES)}
279
-
280
- # Overlay prediction text
281
- cv2.rectangle(image_bgr, (0, 0), (640, 45), (245, 117, 16), -1)
 
 
 
 
 
282
  cv2.putText(
283
  image_bgr,
284
- pred_text,
285
  (10, 30),
286
  cv2.FONT_HERSHEY_SIMPLEX,
287
  0.9,
@@ -290,32 +296,26 @@ def run(frame, sequence_state):
290
  cv2.LINE_AA
291
  )
292
 
293
- # Back to RGB for Gradio display
294
  out_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
 
295
 
296
- # If probs_dict is empty (e.g., still warming up), show something stable
297
- if not probs_dict:
298
- probs_dict = {"(warming up)": 1.0}
299
-
300
- return out_rgb, probs_dict, sequence_state
301
 
 
 
 
302
  with gr.Blocks() as demo:
303
- gr.Markdown("# Live Sign Language Gesture Demo (CNN-LSTM + Multi-Head Attention)")
304
- gr.Markdown("Show your hand gesture to the webcam. Prediction starts after 30 frames are collected.")
305
-
306
- seq_state = gr.State([])
307
 
308
- with gr.Row():
309
- cam = gr.Image(sources=["webcam"], type="numpy", label="Webcam")
310
- out_img = gr.Image(type="numpy", label="Output (Annotated)")
311
 
312
- out_label = gr.Label(num_top_classes=5, label="Probabilities (Top 5)")
313
 
314
- # Stream events are supported in gradio 4.x
315
  cam.stream(
316
  fn=run,
317
- inputs=[cam, seq_state],
318
- outputs=[out_img, out_label, seq_state],
319
  )
320
 
321
  if __name__ == "__main__":
 
193
  # ----------------------------
194
  # Gradio inference with state
195
  # ----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
# Thicker landmark/connection styles so the tracking overlay is clearly visible.
HAND_LANDMARK_STYLE = mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=4, circle_radius=6)
HAND_CONNECTION_STYLE = mp_drawing.DrawingSpec(color=(255, 0, 0), thickness=4)

POSE_LANDMARK_STYLE = mp_drawing.DrawingSpec(color=(0, 255, 255), thickness=3, circle_radius=5)
POSE_CONNECTION_STYLE = mp_drawing.DrawingSpec(color=(0, 128, 255), thickness=3)


def draw_styled_landmarks(image, results):
    """Draw pose and hand landmarks onto ``image`` in place.

    Uses the thick DrawingSpec styles above. Pose drawing is optional in
    spirit (remove its entry below for a small speed-up). Any landmark set
    missing from ``results`` is simply skipped.
    """
    # (landmark set, connection topology, landmark style, connection style)
    overlays = (
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         POSE_LANDMARK_STYLE, POSE_CONNECTION_STYLE),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         HAND_LANDMARK_STYLE, HAND_CONNECTION_STYLE),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         HAND_LANDMARK_STYLE, HAND_CONNECTION_STYLE),
    )
    for landmarks, connections, lm_style, conn_style in overlays:
        if landmarks:
            mp_drawing.draw_landmarks(image, landmarks, connections, lm_style, conn_style)
233
 
 
234
 
235
def run(frame, state):
    """Single-screen streaming callback for the Gradio webcam.

    frame: RGB numpy array from the webcam (None before the camera starts).
    state: dict carrying the rolling keypoint window ("seq"), a frame
           counter ("t"), and the last overlay text ("text"); None on the
           very first call.
    Returns (annotated RGB frame, updated state) — the annotated frame is
    written back into the same webcam component.
    """
    if frame is None:
        return None, state

    if state is None:
        state = {"seq": [], "t": 0, "text": "Warming up..."}

    state["t"] += 1

    # Gradio delivers RGB; the helpers below work on cv2-style BGR frames.
    frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    # OPTIONAL speed boost: smaller frame (uncomment if needed)
    # frame_bgr = cv2.resize(frame_bgr, (640, 360))

    image_bgr, results = mediapipe_detection(frame_bgr, holistic)

    # Tracking overlay is drawn onto the SAME frame we return.
    draw_styled_landmarks(image_bgr, results)

    # Keep only the most recent SEQ_LEN keypoint vectors for the model.
    state["seq"] = (state["seq"] + [extract_keypoints(results)])[-SEQ_LEN:]

    hands_present = (
        results.left_hand_landmarks is not None
        or results.right_hand_landmarks is not None
    )

    if not hands_present:
        state["text"] = "No hands detected"
    elif len(state["seq"]) < SEQ_LEN:
        state["text"] = f"Warming up... {len(state['seq'])}/{SEQ_LEN}"
    else:
        # Run inference only every few frames to cut CPU lag; between runs
        # the previous prediction text is kept so the display stays stable.
        PRED_EVERY = 3
        if state["t"] % PRED_EVERY == 0:
            x = torch.tensor(np.expand_dims(state["seq"], axis=0), dtype=torch.float32)
            with torch.no_grad():
                logits = model(x)
                probs = torch.softmax(logits, dim=1)[0].cpu().numpy()

            top_idx = int(np.argmax(probs))
            conf = float(probs[top_idx])
            state["text"] = f"{LABELS[top_idx]} ({conf:.0%})"

    # Status banner + top-1 prediction text on the same screen.
    cv2.rectangle(image_bgr, (0, 0), (640, 45), (0, 0, 0), -1)
    cv2.putText(
        image_bgr,
        state["text"],
        (10, 30),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.9,
        (255, 255, 255),  # NOTE(review): colour/thickness lines fall outside the visible diff hunk;
        2,                # values taken from the identical earlier call — confirm against the file
        cv2.LINE_AA,
    )

    out_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    return out_rgb, state
301
 
 
 
 
 
 
302
 
303
# ----------------------------
# One-screen Gradio UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Live Hand Gesture Tracking (Single Screen)")
    gr.Markdown("Webcam shows the tracking overlay directly. Prediction text appears on the same screen.")

    # Per-session dict state; `run` initialises it on the first frame.
    session_state = gr.State(None)

    webcam = gr.Image(sources=["webcam"], type="numpy", label="Webcam (Overlay)")

    # Stream every frame through `run`, feeding the annotated result back
    # into the SAME webcam component so overlay and feed share one screen.
    webcam.stream(
        fn=run,
        inputs=[webcam, session_state],
        outputs=[webcam, session_state],
    )
320
 
321
  if __name__ == "__main__":