# Integrate ML models with model selector dropdown (commit 4705494, by d2j666)
import gradio as gr
import cv2
import numpy as np
from model import ASLDetector
from model_ml import ASLDetectorML
# Global detector cache for lazy loading.
# Maps a model-choice string from the UI dropdown to its detector instance so
# each model is constructed at most once per process; populated by get_detector().
_detector_cache = {}
def get_detector(model_choice):
    """Return the detector for *model_choice*, building it on first request.

    Instances are memoized in the module-level ``_detector_cache`` so repeated
    calls with the same choice reuse the already-loaded model instead of
    re-downloading / re-initializing it.
    """
    cached = _detector_cache.get(model_choice)
    if cached is not None:
        return cached

    print(f"[INFO] Creating new detector: {model_choice}")
    if model_choice == "MediaPipe (Rule-based)":
        instance = ASLDetector()
    else:
        # All other dropdown values name an ML backbone understood by ASLDetectorML.
        instance = ASLDetectorML(model_name=model_choice)

    _detector_cache[model_choice] = instance
    return instance
def detect_asl(image, model_choice):
    """Process an image and detect the ASL gesture using the selected model.

    Args:
        image: Frame from Gradio as a numpy array (RGB, RGBA, or grayscale),
            or ``None`` when nothing was captured/uploaded.
        model_choice: Dropdown value, e.g. "EfficientNetB4" or
            "MediaPipe (Rule-based)".

    Returns:
        Tuple ``(annotated_image_or_None, result_message)``.  This function
        never raises: model-loading and detection failures are converted into
        a message so the UI stays responsive.
    """
    print(f"[INFO] detect_asl called - model: {model_choice}, image type: {type(image)}, is None: {image is None}")
    # Gradio can deliver None (no capture) or unexpected types; reject early.
    if image is None or not isinstance(image, np.ndarray):
        print("[WARN] Invalid input - rejecting image")
        return None, "Please provide an image (use Upload or capture from Webcam)"
    print(f"[INFO] Image received - shape: {image.shape}, dtype: {image.dtype}")
    # Normalize to 3-channel RGB, which the detectors expect.
    if image.ndim == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        print("[INFO] Converted grayscale to RGB")
    elif image.ndim == 3 and image.shape[2] == 4:
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
        print("[INFO] Converted RGBA to RGB")
    # Model loading gets its own handler so loading failures show the
    # model-setup guidance, while detection failures below are reported
    # accurately instead of being mislabeled as load errors.
    try:
        detector = get_detector(model_choice)
    except Exception as e:
        error_msg = f"Error loading model: {str(e)}\n\nPlease ensure models are uploaded to HuggingFace Hub.\nSee MODEL_SETUP.md for instructions."
        print(f"[ERROR] {error_msg}")
        return image, error_msg
    try:
        annotated_image, letter, confidence = detector.process_frame(image)
    except Exception as e:
        error_msg = f"Error during detection: {str(e)}"
        print(f"[ERROR] {error_msg}")
        return image, error_msg
    print(f"[INFO] Detection result - letter: {letter}, confidence: {confidence}")
    # Build the user-facing result message.
    if letter and letter != "Unknown":
        result = f"Detected: {letter} (Confidence: {confidence:.2f})\nModel: {model_choice}"
    elif letter == "Unknown":
        if model_choice == "MediaPipe (Rule-based)":
            result = "Hand detected but gesture not recognized. Try: A, V, B, 1, or W"
        else:
            result = f"Hand detected but gesture not recognized.\nModel: {model_choice}"
    else:
        result = "No hand detected. Please show a clear hand gesture."
    print(f"[INFO] Returning result: {result}")
    return annotated_image, result
# Create Gradio interface with tabs for different input methods.
def _add_detection_tab(title, sources, image_label, selector, streaming=False):
    """Add one input tab wired to detect_asl.

    Builds the input image widget (plus a trigger button for non-streaming
    tabs), the annotated-output image, and the result textbox, then connects
    them so detection uses the model currently chosen in *selector*.
    """
    with gr.Tab(title):
        with gr.Row():
            with gr.Column():
                if streaming:
                    feed = gr.Image(
                        sources=sources,
                        type="numpy",
                        label=image_label,
                        interactive=True,
                        streaming=True,
                    )
                    run_btn = None
                else:
                    feed = gr.Image(
                        sources=sources,
                        type="numpy",
                        label=image_label,
                        interactive=True,
                    )
                    run_btn = gr.Button("Detect Gesture", variant="primary")
            with gr.Column():
                annotated = gr.Image(label="Detected Hand Landmarks")
                result_box = gr.Textbox(label="Detection Result", lines=3)
        if streaming:
            # Streaming tab: run detection continuously on each frame.
            feed.stream(
                fn=detect_asl,
                inputs=[feed, selector],
                outputs=[annotated, result_box],
            )
        else:
            # Snapshot tabs: run detection only when the button is pressed.
            run_btn.click(
                fn=detect_asl,
                inputs=[feed, selector],
                outputs=[annotated, result_box],
            )


with gr.Blocks(title="ASL Hand Detection System") as demo:
    gr.Markdown("""
# ASL Hand Detection System
American Sign Language hand gesture detection using MediaPipe and Deep Learning.
- **EfficientNetB4**: Balanced performance and speed (recommended)
- **EfficientNetB7**: Higher accuracy, slower inference
- **EfficientNetB9**: Highest accuracy, slowest inference
- **MediaPipe (Rule-based)**: Fast, lightweight fallback (5 gestures only)
**Supported Gestures (ML Models):** A-Z, del, nothing, space (29 total)
**MediaPipe Gestures:** A, V, B, 1, W (5 total)
""")
    # Model selector dropdown shared by every tab below.
    with gr.Row():
        model_selector = gr.Dropdown(
            choices=[
                "EfficientNetB4",
                "EfficientNetB7",
                "EfficientNetB9",
                "MediaPipe (Rule-based)",
            ],
            value="MediaPipe (Rule-based)",
            label="Select Model",
            info="First-time model (EfficientNet Based) loading may take 5-10 seconds",
        )
    gr.Markdown("**Note:** Switching between ML models (B4/B7/B9) may take 5-10 seconds on first load as the model downloads from HuggingFace Hub. Subsequent uses will be instant.")
    with gr.Tabs():
        _add_detection_tab("Take a Picture", ["webcam"], "Webcam", model_selector)
        _add_detection_tab("Upload Image", ["upload"], "Upload Image", model_selector)
        _add_detection_tab(
            "Live Streaming", ["webcam"], "Live Webcam Feed", model_selector,
            streaming=True,
        )
def _run_app():
    """Launch the demo and log lifecycle events, exiting cleanly on Ctrl-C."""
    try:
        print("[INFO] Starting ASL Hand Detection System...")
        print("[INFO] Note: First-time model loading may take 5-10 seconds")
        demo.launch()
    except KeyboardInterrupt:
        # Ctrl-C: shut down without dumping a traceback.
        print("\n[INFO] Shutting down gracefully...")
    finally:
        print("[INFO] Application stopped")


if __name__ == "__main__":
    _run_app()