Update app.py
app.py
CHANGED
@@ -259,7 +259,7 @@
     seq_state = gr.State([])
 
     with gr.Row():
-        cam = gr.
+        cam = gr.Webcam(streaming=True, label="Webcam")
         out_img = gr.Image(type="numpy", label="Output (Annotated)")
 
     out_label = gr.Label(num_top_classes=5, label="Probabilities (Top 5)")

Full app.py after the change:
import json
import numpy as np
import cv2
import gradio as gr
import torch
import torch.nn as nn
import mediapipe as mp

# ----------------------------
# Load labels (labels.json)
# Supports:
# 1) ["label1","label2",...]
# 2) {"0":"label1","1":"label2",...}
# ----------------------------
def load_labels(path="labels.json"):
    with open(path, "r", encoding="utf-8") as f:
        obj = json.load(f)
    if isinstance(obj, list):
        return obj
    if isinstance(obj, dict):
        items = sorted(obj.items(), key=lambda kv: int(kv[0]))
        return [v for _, v in items]
    raise ValueError("labels.json must be a list or a dict mapping index -> label.")

LABELS = load_labels("labels.json")
NUM_CLASSES = len(LABELS)
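# Example labels.json contents (hypothetical class names, assuming three gestures):
#   ["hello", "thanks", "iloveyou"]
# or, equivalently, the dict form with string indices:
#   {"0": "hello", "1": "thanks", "2": "iloveyou"}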

# ----------------------------
# MediaPipe helpers (from your notebook)
# ----------------------------
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

def draw_styled_landmarks(image, results):
    mp_drawing.draw_landmarks(
        image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
        mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=1, circle_radius=1),
        mp_drawing.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1)
    )
    mp_drawing.draw_landmarks(
        image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
        mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=1, circle_radius=2),
        mp_drawing.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1)
    )
    mp_drawing.draw_landmarks(
        image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
        mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=1, circle_radius=2),
        mp_drawing.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1)
    )

def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() \
        if results.pose_landmarks else np.zeros(33 * 4)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() \
        if results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() \
        if results.right_hand_landmarks else np.zeros(21 * 3)
    return np.concatenate([pose, lh, rh])  # 258 dims
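# Feature layout: 33 pose landmarks x (x, y, z, visibility) = 132,
# plus 21 landmarks x (x, y, z) per hand = 63 each, so 132 + 63 + 63 = 258.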

# ----------------------------
# Model code (from your notebook)
# ----------------------------
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads=8, dropout=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        residual = x
        Q = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = torch.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        attn_output = torch.matmul(attn_weights, V)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim)
        output = self.out_proj(attn_output)
        output = self.norm(output + residual)
        return output, attn_weights
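# Standard scaled dot-product attention per head:
#   softmax(Q K^T / sqrt(head_dim)) V,
# followed by an output projection and a post-norm residual, LayerNorm(out + x).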

class AttentionEnhancedLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=True, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers,
            batch_first=True, bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0
        )
        lstm_output_dim = hidden_size * 2 if bidirectional else hidden_size
        self.attention = MultiHeadSelfAttention(embed_dim=lstm_output_dim, num_heads=8, dropout=dropout)

    def forward(self, x):
        lstm_out, (h_n, c_n) = self.lstm(x)
        attn_out, attn_weights = self.attention(lstm_out)
        return attn_out, (h_n, c_n), attn_weights

class CNNLSTMAttention(nn.Module):
    def __init__(self, input_size, num_classes, dropout=0.4, num_attention_heads=8):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=input_size, out_channels=128, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(128)
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(256)
        self.conv3 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(128)
        self.dropout_cnn = nn.Dropout(dropout)

        self.ae_lstm1 = AttentionEnhancedLSTM(128, 256, num_layers=1, bidirectional=True, dropout=dropout)
        self.ae_lstm2 = AttentionEnhancedLSTM(512, 128, num_layers=1, bidirectional=True, dropout=dropout)
        self.dropout_lstm = nn.Dropout(dropout)

        self.temporal_attention = MultiHeadSelfAttention(embed_dim=256, num_heads=num_attention_heads, dropout=dropout)
        self.attention_pool = nn.Linear(256, 1)

        self.fc1 = nn.Linear(256, 128)
        self.bn_fc = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.dropout_fc = nn.Dropout(dropout)
        self.output_layer = nn.Linear(64, num_classes)

    def forward(self, x):
        # x: (batch, seq_len, features=258)
        x = x.permute(0, 2, 1)  # (batch, features, seq_len)

        x = torch.relu(self.bn1(self.conv1(x)))
        x = self.dropout_cnn(x)
        x = torch.relu(self.bn2(self.conv2(x)))
        x = self.dropout_cnn(x)
        x = torch.relu(self.bn3(self.conv3(x)))
        x = self.dropout_cnn(x)

        x = x.permute(0, 2, 1)  # (batch, seq_len, channels=128)

        x, _, _ = self.ae_lstm1(x)  # -> (batch, seq_len, 512)
        x = self.dropout_lstm(x)
        x, _, _ = self.ae_lstm2(x)  # -> (batch, seq_len, 256)
        x = self.dropout_lstm(x)

        attn_output, _ = self.temporal_attention(x)  # (batch, seq_len, 256)
        attention_scores = torch.softmax(self.attention_pool(attn_output), dim=1)  # (batch, seq_len, 1)
        pooled_output = torch.sum(attention_scores * attn_output, dim=1)  # (batch, 256)

        x = torch.relu(self.bn_fc(self.fc1(pooled_output)))
        x = self.dropout_fc(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout_fc(x)
        x = self.output_layer(x)
        return x

# ----------------------------
# Load trained weights
# ----------------------------
DEVICE = "cpu"
INPUT_SIZE = 258
SEQ_LEN = 30

model = CNNLSTMAttention(INPUT_SIZE, NUM_CLASSES, dropout=0.4, num_attention_heads=8)
state = torch.load("trained_model.pth", map_location=DEVICE)
model.load_state_dict(state, strict=True)
model.eval()
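# Quick shape sanity check (a minimal sketch; run in a REPL, not needed by the app):
#   x = torch.randn(1, SEQ_LEN, INPUT_SIZE)
#   with torch.no_grad():
#       print(model(x).shape)  # expected: torch.Size([1, NUM_CLASSES])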

# One MediaPipe instance for the whole app (faster)
holistic = mp_holistic.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

# ----------------------------
# Gradio inference with state
# ----------------------------
def run(frame, sequence_state):
    """
    frame: numpy array from webcam (RGB)
    sequence_state: list of last keypoint vectors
    returns: annotated_frame (RGB), label dict, updated sequence_state
    """
    if sequence_state is None:
        sequence_state = []

    # Gradio gives RGB; MediaPipe helper expects BGR for cv2 conversions
    frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    image_bgr, results = mediapipe_detection(frame_bgr, holistic)
    draw_styled_landmarks(image_bgr, results)

    keypoints = extract_keypoints(results)
    sequence_state.append(keypoints)
    sequence_state = sequence_state[-SEQ_LEN:]

    probs_dict = {}
    pred_text = "Waiting..."
    conf = 0.0

    hands_present = (results.left_hand_landmarks is not None) or (results.right_hand_landmarks is not None)

    if not hands_present:
        pred_text = "No hands detected"
    elif len(sequence_state) == SEQ_LEN:
        x = torch.tensor(np.expand_dims(sequence_state, axis=0), dtype=torch.float32)  # (1, 30, 258)
        with torch.no_grad():
            logits = model(x)
            probs = torch.softmax(logits, dim=1)[0].cpu().numpy()

        top_idx = int(np.argmax(probs))
        conf = float(probs[top_idx])
        pred_text = f"{LABELS[top_idx]} ({conf:.2%})"
        probs_dict = {LABELS[i]: float(probs[i]) for i in range(NUM_CLASSES)}

    # Overlay prediction text
    cv2.rectangle(image_bgr, (0, 0), (640, 45), (245, 117, 16), -1)
    cv2.putText(
        image_bgr,
        pred_text,
        (10, 30),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.9,
        (255, 255, 255),
        2,
        cv2.LINE_AA
    )

    # Back to RGB for Gradio display
    out_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

    # If probs_dict is empty (e.g., still warming up), show something stable
    if not probs_dict:
        probs_dict = {"(warming up)": 1.0}

    return out_rgb, probs_dict, sequence_state

with gr.Blocks() as demo:
    gr.Markdown("# Live Sign Language Gesture Demo (CNN-LSTM + Multi-Head Attention)")
    gr.Markdown("Show your hand gesture to the webcam. Prediction starts after 30 frames are collected.")

    seq_state = gr.State([])

    with gr.Row():
        cam = gr.Webcam(streaming=True, label="Webcam")
        out_img = gr.Image(type="numpy", label="Output (Annotated)")

    out_label = gr.Label(num_top_classes=5, label="Probabilities (Top 5)")

    cam.stream(
        fn=run,
        inputs=[cam, seq_state],
        outputs=[out_img, out_label, seq_state],
    )

if __name__ == "__main__":
    demo.launch()
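Runtime note: the imports imply the Space needs gradio, torch, opencv-python, mediapipe, and numpy in requirements.txt (exact version pins are an assumption). The `gr.Webcam` component and the `.stream()` event match the Gradio 3.x API this app appears to target; newer Gradio releases replace `gr.Webcam` with `gr.Image(sources=["webcam"], streaming=True)`.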