Update app.py
app.py CHANGED

@@ -1,30 +1,54 @@
 import gradio as gr
 import pickle
+import joblib
 import cv2
 import mediapipe as mp
 import numpy as np
 from PIL import Image
 import warnings
+import os

 # Suppress sklearn version warnings
 warnings.filterwarnings('ignore', category=UserWarning)

-# Load the model with
+# Load the model with multiple fallback options
+def load_model():
+    """Try loading model from different formats"""
+    if os.path.exists('./model.joblib'):
+        print("Loading model from joblib...")
+        return joblib.load('./model.joblib')
+    elif os.path.exists('./model_v2.p'):
+        print("Loading model from model_v2.p...")
+        with open('./model_v2.p', 'rb') as f:
+            model_dict = pickle.load(f)
+        return model_dict['model']
+    elif os.path.exists('./model.p'):
+        print("Loading model from model.p...")
+        with open('./model.p', 'rb') as f:
+            model_dict = pickle.load(f)
+        return model_dict['model']
+    else:
+        raise FileNotFoundError("No model file found!")
+
 try:
-
-
-    model = model_dict['model']
-    print("Model loaded successfully!")
+    model = load_model()
+    print("✓ Model loaded successfully!")
 except Exception as e:
-    print(f"Error loading model: {e}")
+    print(f"✗ Error loading model: {e}")
     raise

 mp_hands = mp.solutions.hands
 mp_drawing = mp.solutions.drawing_utils
 mp_drawing_styles = mp.solutions.drawing_styles

-# Initialize hand detection
-hands = mp_hands.Hands(
+# Initialize hand detection - optimized for speed
+hands = mp_hands.Hands(
+    static_image_mode=False,  # False for video/real-time
+    max_num_hands=2,
+    min_detection_confidence=0.5,
+    min_tracking_confidence=0.5
+)

 labels_dict = {
     0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I',
@@ -32,31 +56,78 @@ labels_dict = {
     18: 'R', 19: 'S', 20: 'space', 21: 'T', 22: 'U', 23: 'V', 24: 'W', 25: 'X', 26: 'Y', 27: 'Z'
 }

-
-
-
-
-
-
-
-
-
-
-
-        # Convert back to RGB for MediaPipe
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

-        #
-

-

-
-
-

-
-
+# Store history for smoothing predictions
+prediction_history = []
+HISTORY_SIZE = 5
+
+def smooth_prediction(new_pred):
+    """Smooth predictions to reduce jitter"""
+    global prediction_history
+    prediction_history.append(new_pred)
+    if len(prediction_history) > HISTORY_SIZE:
+        prediction_history.pop(0)
+
+    # Return most common prediction
+    if prediction_history:
+        return max(set(prediction_history), key=prediction_history.count)
+    return new_pred
+
+def predict_sign_realtime(image):
+    """Process image and predict sign language character in real-time"""
+    if image is None:
+        return None, "No image provided", ""
+
+    try:
+        # Convert PIL Image to numpy array
+        frame = np.array(image)
+
+        # Convert RGB to BGR for OpenCV
+        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+
+        H, W, _ = frame.shape
+
+        # Convert back to RGB for MediaPipe
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+        # Process the frame with MediaPipe
+        results = hands.process(frame_rgb)
+
+        predicted_character = "No hand detected"
+        confidence_text = ""
+
+        if results.multi_hand_landmarks:
+            data_aux = []
+            x_all, y_all = [], []
+
+            if len(results.multi_hand_landmarks) == 2:  # Two-hand sign
+                for hand_landmarks in results.multi_hand_landmarks:
+                    x_, y_ = [], []
+
+                    for i in range(len(hand_landmarks.landmark)):
+                        x = hand_landmarks.landmark[i].x
+                        y = hand_landmarks.landmark[i].y
+                        x_.append(x)
+                        y_.append(y)
+
+                    x_all.extend(x_)
+                    y_all.extend(y_)
+
+                    for i in range(len(hand_landmarks.landmark)):
+                        data_aux.append(hand_landmarks.landmark[i].x - min(x_))
+                        data_aux.append(hand_landmarks.landmark[i].y - min(y_))
+
+                    # Draw hand landmarks
+                    mp_drawing.draw_landmarks(
+                        frame,
+                        hand_landmarks,
+                        mp_hands.HAND_CONNECTIONS,
+                        mp_drawing_styles.get_default_hand_landmarks_style(),
+                        mp_drawing_styles.get_default_hand_connections_style()
+                    )
+
+            elif len(results.multi_hand_landmarks) == 1:  # One-hand sign
+                hand_landmarks = results.multi_hand_landmarks[0]
                 x_, y_ = [], []

                 for i in range(len(hand_landmarks.landmark)):
@@ -72,6 +143,9 @@ def predict_sign(image):
                     data_aux.append(hand_landmarks.landmark[i].x - min(x_))
                     data_aux.append(hand_landmarks.landmark[i].y - min(y_))

+                # Pad with zeros to match two-hand format
+                data_aux.extend([0] * (84 - len(data_aux)))
+
                 # Draw hand landmarks
                 mp_drawing.draw_landmarks(
                     frame,
@@ -80,71 +154,109 @@ def predict_sign(image):
                     mp_drawing_styles.get_default_hand_landmarks_style(),
                     mp_drawing_styles.get_default_hand_connections_style()
                 )
-
-            elif len(results.multi_hand_landmarks) == 1:  # One-hand sign
-                hand_landmarks = results.multi_hand_landmarks[0]
-                x_, y_ = [], []

-
-
-
-
-

-
-

-
-
-

-        #
-

-        # Draw
-
-
-
-
-                        mp_drawing_styles.get_default_hand_landmarks_style(),
-                        mp_drawing_styles.get_default_hand_connections_style()
-                    )

-    # Convert
-
-        prediction = model.predict([np.asarray(data_aux)])
-        predicted_character = labels_dict.get(prediction[0], str(prediction[0]))
-    except Exception as e:
-        predicted_character = f"Error: {str(e)}"

-
-
-
-
-

-
-
-

-
-

-
-
-
-
-
-
-
-        gr.Image(label="Detected Sign"),
-        gr.Textbox(label="Predicted Character")
-    ],
-    title="Sign Language Recognition",
-    description="Show a sign language gesture to the camera or upload an image. The model will detect and classify the sign.",
-    examples=None,
-    live=True  # Enable real-time prediction with webcam
-)
+
+            # Convert to NumPy array and predict
+            try:
+                prediction = model.predict([np.asarray(data_aux)])
+                raw_pred = labels_dict.get(prediction[0], str(prediction[0]))
+
+                # Smooth prediction
+                predicted_character = smooth_prediction(raw_pred)
+
+                # Get confidence if available
+                if hasattr(model, 'predict_proba'):
+                    proba = model.predict_proba([np.asarray(data_aux)])
+                    confidence = np.max(proba) * 100
+                    confidence_text = f"Confidence: {confidence:.1f}%"
+
+            except Exception as e:
+                predicted_character = f"Error: {str(e)}"
+                print(f"Prediction error: {e}")
+
+            # Draw the bounding box and prediction
+            x1 = int(min(x_all) * W) - 10
+            y1 = int(min(y_all) * H) - 10
+            x2 = int(max(x_all) * W) + 10
+            y2 = int(max(y_all) * H) + 10
+
+            # Draw bounding box
+            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 3)
+
+            # Draw prediction text with background
+            text = predicted_character
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            font_scale = 1.5
+            thickness = 3
+
+            # Get text size for background
+            (text_width, text_height), baseline = cv2.getTextSize(text, font, font_scale, thickness)
+
+            # Draw black background for text
+            cv2.rectangle(frame, (x1, y1 - text_height - 20), (x1 + text_width + 10, y1), (0, 0, 0), -1)
+
+            # Draw text
+            cv2.putText(frame, text, (x1 + 5, y1 - 10), font, font_scale, (0, 255, 0), thickness, cv2.LINE_AA)
+
+        # Convert BGR back to RGB for display
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+        return frame, predicted_character, confidence_text
+
+    except Exception as e:
+        print(f"Error in predict_sign: {e}")
+        return None, f"Error: {str(e)}", ""
+
+# Create Gradio interface with real-time streaming
+with gr.Blocks(title="Sign Language Recognition") as demo:
+    gr.Markdown(
+        """
+        # 🤟 Real-Time Sign Language Recognition
+        Show your sign language gesture to the camera for real-time detection!
+        """
+    )
+
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(
+                sources=["webcam"],
+                type="pil",
+                label="Webcam Feed",
+                streaming=True  # Enable streaming for real-time
+            )
+
+        with gr.Column():
+            output_image = gr.Image(label="Detected Sign")
+            predicted_text = gr.Textbox(
+                label="Predicted Character",
+                scale=1,
+                lines=1
+            )
+            confidence_text = gr.Textbox(
+                label="Confidence",
+                scale=1,
+                lines=1
+            )
+
+    gr.Markdown(
+        """
+        ### Supported Signs
+        A-Z letters, Space, Nothing
+
+        ### Tips for better detection:
+        - Ensure good lighting
+        - Keep hand in frame
+        - Make clear gestures
+        - Hold the sign steady for 1-2 seconds
+        """
+    )
+
+    # Set up real-time prediction
+    input_image.stream(
+        fn=predict_sign_realtime,
+        inputs=input_image,
+        outputs=[output_image, predicted_text, confidence_text],
+        show_progress=False  # Hide progress for smoother experience
+    )

 if __name__ == "__main__":
     demo.launch()
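
Note: the new load_model() checks for model.joblib before falling back to the pickle files. If a model.joblib still needs to be produced from the existing model.p, a minimal one-off conversion sketch follows; the script name is illustrative, and it assumes model.p holds a dict with a fitted scikit-learn estimator under the 'model' key, which is the same shape load_model() reads from the pickle fallbacks:

# convert_model.py - hypothetical one-off helper, not part of this commit
import pickle

import joblib

# Assumes model.p holds {'model': <fitted sklearn estimator>},
# matching what load_model() expects from the pickle formats
with open('./model.p', 'rb') as f:
    model = pickle.load(f)['model']

# Write the joblib copy that load_model() looks for first
joblib.dump(model, './model.joblib')

If model.joblib is absent, nothing changes: load_model() falls through to model_v2.p and then model.p exactly as written.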
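
For reference, the constant 84 in the one-hand padding is the feature length the classifier expects: 2 hands × 21 MediaPipe hand landmarks × 2 normalized coordinates. A standalone sketch of that layout, with illustrative helper names that do not appear in app.py:

MAX_HANDS = 2
NUM_LANDMARKS = 21   # MediaPipe hand landmarks per hand
COORDS = 2           # normalized x and y
FEATURE_LEN = MAX_HANDS * NUM_LANDMARKS * COORDS  # 2 * 21 * 2 = 84

def build_features(hands_xy):
    """hands_xy: one or two per-hand lists of (x, y) landmark tuples."""
    data_aux = []
    for landmarks in hands_xy:
        xs = [x for x, _ in landmarks]
        ys = [y for _, y in landmarks]
        for x, y in landmarks:
            # Same normalization as app.py: offset by the hand's min x/y
            data_aux.append(x - min(xs))
            data_aux.append(y - min(ys))
    # Same zero-padding as the one-hand branch in app.py
    data_aux.extend([0] * (FEATURE_LEN - len(data_aux)))
    return data_aux

One caveat worth flagging: x_all and y_all are only extended in the two-hand branch of predict_sign_realtime, so for single-hand signs min(x_all) in the bounding-box code appears to raise on an empty list and fall through to the outer except; extending x_all/y_all with x_/y_ in the one-hand branch would be the likely fix.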