import gradio as gr
import cv2
import numpy as np

from model import ASLDetector
from model_ml import ASLDetectorML
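
# NOTE: Both detector classes are expected to expose the same interface,
# process_frame(image) -> (annotated_image, letter, confidence), as used in
# detect_asl() below; ASLDetectorML additionally accepts the dropdown string
# as its model_name.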

# Global detector cache for lazy loading
_detector_cache = {}


def get_detector(model_choice):
    """Get or create a detector instance with lazy loading and caching."""
    # Return the cached detector if this model has already been loaded
    if model_choice in _detector_cache:
        return _detector_cache[model_choice]

    # Create a new detector instance for the selected model
    print(f"[INFO] Creating new detector: {model_choice}")
    if model_choice == "MediaPipe (Rule-based)":
        detector = ASLDetector()
    else:
        detector = ASLDetectorML(model_name=model_choice)

    # Cache it for future use
    _detector_cache[model_choice] = detector
    return detector
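
# The cache is keyed by the dropdown string, so each model is loaded at most
# once per process; switching back to an already-loaded model is instant.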


def detect_asl(image, model_choice):
    """Process an image and detect an ASL gesture using the selected model."""
    print(f"[INFO] detect_asl called - model: {model_choice}, "
          f"image type: {type(image)}, is None: {image is None}")

    if image is None or not isinstance(image, np.ndarray):
        print("[WARN] Invalid input - rejecting image")
        return None, "Please provide an image (use Upload or capture from Webcam)"

    print(f"[INFO] Image received - shape: {image.shape}, dtype: {image.dtype}")

    # Convert to 3-channel RGB if needed
    if len(image.shape) == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        print("[INFO] Converted grayscale to RGB")
    elif len(image.shape) == 3 and image.shape[2] == 4:
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
        print("[INFO] Converted RGBA to RGB")
    try:
        # Get or create the detector (lazy loading)
        detector = get_detector(model_choice)

        # Run detection on the frame
        annotated_image, letter, confidence = detector.process_frame(image)
        print(f"[INFO] Detection result - letter: {letter}, confidence: {confidence}")

        # Build the result message
        if letter and letter != "Unknown":
            result = f"Detected: {letter} (Confidence: {confidence:.2f})\nModel: {model_choice}"
        elif letter == "Unknown":
            if model_choice == "MediaPipe (Rule-based)":
                result = "Hand detected but gesture not recognized. Try: A, V, B, 1, or W"
            else:
                result = f"Hand detected but gesture not recognized.\nModel: {model_choice}"
        else:
            result = "No hand detected. Please show a clear hand gesture."

        print(f"[INFO] Returning result: {result}")
        return annotated_image, result

    except Exception as e:
        error_msg = (
            f"Error loading model: {e}\n\n"
            "Please ensure models are uploaded to HuggingFace Hub.\n"
            "See MODEL_SETUP.md for instructions."
        )
        print(f"[ERROR] {error_msg}")
        return image, error_msg
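
# Quick sanity check without the UI (hypothetical file name):
#   img = cv2.cvtColor(cv2.imread("hand.jpg"), cv2.COLOR_BGR2RGB)
#   annotated, message = detect_asl(img, "MediaPipe (Rule-based)")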


# Create the Gradio interface with tabs for the different input methods
with gr.Blocks(title="ASL Hand Detection System") as demo:
    gr.Markdown("""
    # ASL Hand Detection System

    American Sign Language hand gesture detection using MediaPipe and deep learning.

    - **EfficientNetB4**: Balanced performance and speed (recommended)
    - **EfficientNetB7**: Higher accuracy, slower inference
    - **EfficientNetB9**: Highest accuracy, slowest inference
    - **MediaPipe (Rule-based)**: Fast, lightweight fallback (5 gestures only)

    **Supported Gestures (ML Models):** A-Z, del, nothing, space (29 total)
    **MediaPipe Gestures:** A, V, B, 1, W (5 total)
    """)

    # Model selector dropdown shared by all tabs
    with gr.Row():
        model_selector = gr.Dropdown(
            choices=[
                "EfficientNetB4",
                "EfficientNetB7",
                "EfficientNetB9",
                "MediaPipe (Rule-based)",
            ],
            value="MediaPipe (Rule-based)",
            label="Select Model",
            info="First-time loading of an EfficientNet model may take 5-10 seconds",
        )

    gr.Markdown(
        "**Note:** Switching to an ML model (B4/B7/B9) may take 5-10 seconds on "
        "first load while the weights download from HuggingFace Hub. "
        "Subsequent uses will be instant."
    )

    with gr.Tabs():
        with gr.Tab("Take a Picture"):
            with gr.Row():
                with gr.Column():
                    webcam_input = gr.Image(
                        sources=["webcam"],
                        type="numpy",
                        label="Webcam",
                        interactive=True,
                    )
                    webcam_btn = gr.Button("Detect Gesture", variant="primary")
                with gr.Column():
                    webcam_output = gr.Image(label="Detected Hand Landmarks")
                    webcam_result = gr.Textbox(label="Detection Result", lines=3)

            webcam_btn.click(
                fn=detect_asl,
                inputs=[webcam_input, model_selector],
                outputs=[webcam_output, webcam_result],
            )
| with gr.Tab("Upload Image"): | |
| with gr.Row(): | |
| with gr.Column(): | |
| upload_input = gr.Image( | |
| sources=["upload"], | |
| type="numpy", | |
| label="Upload Image", | |
| interactive=True | |
| ) | |
| upload_btn = gr.Button("Detect Gesture", variant="primary") | |
| with gr.Column(): | |
| upload_output = gr.Image(label="Detected Hand Landmarks") | |
| upload_result = gr.Textbox(label="Detection Result", lines=3) | |
| upload_btn.click( | |
| fn=detect_asl, | |
| inputs=[upload_input, model_selector], | |
| outputs=[upload_output, upload_result] | |
| ) | |
| with gr.Tab("Live Streaming"): | |
| with gr.Row(): | |
| with gr.Column(): | |
| stream_input = gr.Image( | |
| sources=["webcam"], | |
| type="numpy", | |
| label="Live Webcam Feed", | |
| interactive=True, | |
| streaming=True | |
| ) | |
| with gr.Column(): | |
| stream_output = gr.Image(label="Detected Hand Landmarks") | |
| stream_result = gr.Textbox(label="Detection Result", lines=3) | |
| stream_input.stream( | |
| fn=detect_asl, | |
| inputs=[stream_input, model_selector], | |
| outputs=[stream_output, stream_result] | |
| ) | |


if __name__ == "__main__":
    try:
        print("[INFO] Starting ASL Hand Detection System...")
        print("[INFO] Note: First-time model loading may take 5-10 seconds")
        demo.launch()
    except KeyboardInterrupt:
        print("\n[INFO] Shutting down gracefully...")
    finally:
        print("[INFO] Application stopped")