"""Real-time sign language recognition: Gradio webcam UI + MediaPipe hand
landmarks + a pre-trained classifier (joblib/pickle)."""

import os
import pickle
import warnings
from collections import deque

import cv2
import gradio as gr
import joblib
import mediapipe as mp
import numpy as np
from PIL import Image

# Suppress sklearn version warnings
warnings.filterwarnings('ignore', category=UserWarning)

# Feature-vector length the classifier expects:
# 2 hands x 21 landmarks x (x, y) = 84. One-hand signs are zero-padded.
NUM_FEATURES = 84


def load_model():
    """Try loading model from different formats.

    Preference order: model.joblib, model_v2.p, model.p.

    Returns:
        The fitted classifier object.

    Raises:
        FileNotFoundError: if none of the known model files exist.
    """
    if os.path.exists('./model.joblib'):
        print("Loading model from joblib...")
        return joblib.load('./model.joblib')
    elif os.path.exists('./model_v2.p'):
        print("Loading model from model_v2.p...")
        # NOTE: pickle is only safe here because the file ships with the app;
        # never load untrusted pickles.
        with open('./model_v2.p', 'rb') as f:
            model_dict = pickle.load(f)
        return model_dict['model']
    elif os.path.exists('./model.p'):
        print("Loading model from model.p...")
        with open('./model.p', 'rb') as f:
            model_dict = pickle.load(f)
        return model_dict['model']
    else:
        raise FileNotFoundError("No model file found!")


try:
    model = load_model()
    print("✓ Model loaded successfully!")
except Exception as e:
    print(f"✗ Error loading model: {e}")
    raise

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# Initialize hand detection - optimized for speed.
# NOTE(review): this single Hands instance is shared across all requests;
# MediaPipe graphs are not documented as thread-safe — confirm if the app
# is served with concurrency enabled.
hands = mp_hands.Hands(
    static_image_mode=False,  # False for video/real-time
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

labels_dict = {
    0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I',
    9: 'J', 10: 'K', 11: 'L', 12: 'M', 13: 'N', 14: 'nothing', 15: 'O',
    16: 'P', 17: 'Q', 18: 'R', 19: 'S', 20: 'space', 21: 'T', 22: 'U',
    23: 'V', 24: 'W', 25: 'X', 26: 'Y', 27: 'Z'
}

# Rolling window of recent predictions for smoothing; deque(maxlen=...)
# drops the oldest entry automatically (replaces list + O(n) pop(0)).
HISTORY_SIZE = 5
prediction_history = deque(maxlen=HISTORY_SIZE)


def smooth_prediction(new_pred):
    """Smooth predictions to reduce jitter.

    Appends the new prediction to a fixed-size rolling window and returns
    the most common label in the window (majority vote).
    """
    prediction_history.append(new_pred)
    if prediction_history:
        return max(set(prediction_history), key=prediction_history.count)
    return new_pred


def _hand_features(hand_landmarks):
    """Extract the per-hand feature vector for one detected hand.

    Returns:
        (features, xs, ys) where features is a flat list of
        (x - min_x, y - min_y) pairs for all 21 landmarks, and xs/ys are the
        raw normalized landmark coordinates (used for the bounding box).
    """
    xs = [lm.x for lm in hand_landmarks.landmark]
    ys = [lm.y for lm in hand_landmarks.landmark]
    min_x, min_y = min(xs), min(ys)
    features = []
    for lm in hand_landmarks.landmark:
        features.append(lm.x - min_x)
        features.append(lm.y - min_y)
    return features, xs, ys


def predict_sign_realtime(image):
    """Process image and predict sign language character in real-time.

    Args:
        image: PIL image (RGB) from the webcam stream, or None.

    Returns:
        (annotated RGB frame or None, predicted character text,
         confidence text — empty when the model has no predict_proba).
    """
    if image is None:
        return None, "No image provided", ""

    try:
        # Convert PIL Image to numpy array
        frame = np.array(image)
        # Convert RGB to BGR for OpenCV drawing
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        H, W, _ = frame.shape

        # Convert back to RGB for MediaPipe
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        predicted_character = "No hand detected"
        confidence_text = ""

        if results.multi_hand_landmarks:
            data_aux = []
            x_all, y_all = [], []

            # Identical per-hand processing for one- or two-hand signs
            # (max_num_hands=2, so at most two iterations).
            for hand_landmarks in results.multi_hand_landmarks[:2]:
                features, xs, ys = _hand_features(hand_landmarks)
                data_aux.extend(features)
                x_all.extend(xs)
                y_all.extend(ys)

                # Draw hand landmarks
                mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style()
                )

            # Pad with zeros to match two-hand format (no-op for two hands)
            data_aux.extend([0] * (NUM_FEATURES - len(data_aux)))

            try:
                feature_vec = np.asarray(data_aux)  # built once, reused below
                prediction = model.predict([feature_vec])
                raw_pred = labels_dict.get(prediction[0], str(prediction[0]))

                # Smooth prediction
                predicted_character = smooth_prediction(raw_pred)

                # Get confidence if available
                if hasattr(model, 'predict_proba'):
                    proba = model.predict_proba([feature_vec])
                    confidence = np.max(proba) * 100
                    confidence_text = f"Confidence: {confidence:.1f}%"
            except Exception as e:
                predicted_character = f"Error: {str(e)}"
                print(f"Prediction error: {e}")

            # Bounding box over all detected hands, with a 10 px margin
            x1 = int(min(x_all) * W) - 10
            y1 = int(min(y_all) * H) - 10
            x2 = int(max(x_all) * W) + 10
            y2 = int(max(y_all) * H) + 10
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 3)

            # Draw prediction text with a black background strip
            text = predicted_character
            font = cv2.FONT_HERSHEY_SIMPLEX
            font_scale = 1.5
            thickness = 3
            (text_width, text_height), baseline = cv2.getTextSize(
                text, font, font_scale, thickness)
            cv2.rectangle(frame, (x1, y1 - text_height - 20),
                          (x1 + text_width + 10, y1), (0, 0, 0), -1)
            cv2.putText(frame, text, (x1 + 5, y1 - 10), font, font_scale,
                        (0, 255, 0), thickness, cv2.LINE_AA)

        # Convert BGR back to RGB for display
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return frame, predicted_character, confidence_text

    except Exception as e:
        print(f"Error in predict_sign: {e}")
        return None, f"Error: {str(e)}", ""


# Create Gradio interface with real-time streaming
with gr.Blocks(title="Sign Language Recognition") as demo:
    gr.Markdown(
        """
        # 🤟 Real-Time Sign Language Recognition
        Show your sign language gesture to the camera for real-time detection!
        """
    )

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(
                sources=["webcam"],
                type="pil",
                label="Webcam Feed",
                streaming=True  # Enable streaming for real-time
            )
        with gr.Column():
            output_image = gr.Image(label="Detected Sign")
            predicted_text = gr.Textbox(
                label="Predicted Character",
                scale=1,
                lines=1
            )
            confidence_text = gr.Textbox(
                label="Confidence",
                scale=1,
                lines=1
            )

    gr.Markdown(
        """
        ### Supported Signs
        A-Z letters, Space, Nothing

        ### Tips for better detection:
        - Ensure good lighting
        - Keep hand in frame
        - Make clear gestures
        - Hold the sign steady for 1-2 seconds
        """
    )

    # Set up real-time prediction
    input_image.stream(
        fn=predict_sign_realtime,
        inputs=input_image,
        outputs=[output_image, predicted_text, confidence_text],
        show_progress=False  # Hide progress for smoother experience
    )

if __name__ == "__main__":
    demo.launch()