# inara / app.py — Sign Language Recognition demo (Hugging Face Space by TiH0,
# commit f640e22, verified). File header reconstructed as a comment so the
# module is valid Python.
import gradio as gr
import pickle
import joblib
import cv2
import mediapipe as mp
import numpy as np
from PIL import Image
import warnings
import os
# Suppress sklearn version warnings
# (unpickling a model saved under a different sklearn version emits
# UserWarning; presumably harmless here — TODO confirm versions match).
warnings.filterwarnings('ignore', category=UserWarning)
# Load the model with multiple fallback options
def load_model():
    """Load the classifier, trying each known artifact format in order.

    Preference order: the joblib archive, then the newer pickle
    (``model_v2.p``), then the legacy pickle (``model.p``).  The joblib
    file stores the estimator directly; the pickle files store a dict
    with the estimator under the ``'model'`` key.

    Returns:
        The deserialized model object.

    Raises:
        FileNotFoundError: if none of the expected model files exist.
    """
    if os.path.exists('./model.joblib'):
        print("Loading model from joblib...")
        return joblib.load('./model.joblib')
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file — only ship model files from trusted sources with this app.
    for path in ('./model_v2.p', './model.p'):
        if os.path.exists(path):
            print(f"Loading model from {os.path.basename(path)}...")
            with open(path, 'rb') as f:
                return pickle.load(f)['model']
    raise FileNotFoundError("No model file found!")
# Load the model once at import time so the prediction handler can use it.
try:
    model = load_model()
    print("✓ Model loaded successfully!")
except Exception as e:
    print(f"✗ Error loading model: {e}")
    # Fail fast: the app cannot do anything useful without a model.
    raise
# MediaPipe hand-tracking handles: landmark detector plus drawing helpers
# used to annotate the output frame.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# Initialize hand detection - optimized for speed
hands = mp_hands.Hands(
    static_image_mode=False,  # False for video/real-time
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

# Class index -> display label. Note 'nothing' (14) and 'space' (20) are
# interleaved with the letters, so indices do NOT map to A..Z in order.
labels_dict = {
    0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I',
    9: 'J', 10: 'K', 11: 'L', 12: 'M', 13: 'N', 14: 'nothing', 15: 'O', 16: 'P', 17: 'Q',
    18: 'R', 19: 'S', 20: 'space', 21: 'T', 22: 'U', 23: 'V', 24: 'W', 25: 'X', 26: 'Y', 27: 'Z'
}
# Store history for smoothing predictions
prediction_history = []
HISTORY_SIZE = 5  # number of recent frames that vote on the displayed label


def smooth_prediction(new_pred):
    """Smooth predictions to reduce jitter.

    Appends *new_pred* to a rolling window of the last HISTORY_SIZE
    predictions and returns the most frequent label in that window
    (majority vote; ties broken arbitrarily).

    Args:
        new_pred: raw label predicted for the current frame.

    Returns:
        The majority label over the recent window.
    """
    # The list is only mutated, never rebound, so no `global` is needed.
    prediction_history.append(new_pred)
    # Trim to the last HISTORY_SIZE entries in one step instead of a
    # per-call pop(0); a no-op while the window is still short.
    del prediction_history[:-HISTORY_SIZE]
    # The window is never empty here (we just appended), so the original
    # `if prediction_history:` fallback was dead code and is removed.
    return max(set(prediction_history), key=prediction_history.count)
def _hand_features(hand_landmarks):
    """Extract classifier features for one detected hand.

    Returns (xs, ys, feats): xs/ys are the raw normalized landmark
    coordinates (used later for the bounding box), feats is the flattened
    [x - min(x), y - min(y), ...] vector the model expects per hand.
    """
    xs = [lm.x for lm in hand_landmarks.landmark]
    ys = [lm.y for lm in hand_landmarks.landmark]
    min_x, min_y = min(xs), min(ys)
    feats = []
    for lm in hand_landmarks.landmark:
        feats.append(lm.x - min_x)
        feats.append(lm.y - min_y)
    return xs, ys, feats


def predict_sign_realtime(image):
    """Process image and predict sign language character in real-time.

    Args:
        image: PIL image from the webcam stream, or None.

    Returns:
        Tuple of (annotated RGB frame, predicted label text, confidence
        text). The frame is None when no image was provided or an
        unexpected error occurred.
    """
    if image is None:
        return None, "No image provided", ""
    try:
        # Convert PIL Image to numpy array (RGB), then to BGR for the
        # OpenCV drawing primitives used below.
        frame = np.array(image)
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        H, W, _ = frame.shape
        # MediaPipe expects RGB input.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        predicted_character = "No hand detected"
        confidence_text = ""

        if results.multi_hand_landmarks:
            data_aux = []
            x_all, y_all = [], []
            # One uniform pipeline for one or two hands (max_num_hands=2);
            # the original duplicated this loop for each case.
            for hand_landmarks in results.multi_hand_landmarks:
                xs, ys, feats = _hand_features(hand_landmarks)
                x_all.extend(xs)
                y_all.extend(ys)
                data_aux.extend(feats)
                # Draw hand landmarks on the output frame.
                mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style()
                )
            # Pad with zeros to the two-hand feature length the model was
            # trained on (84 = 2 hands * 21 landmarks * 2 coords); this is
            # a no-op when both hands were detected.
            data_aux.extend([0] * (84 - len(data_aux)))

            try:
                prediction = model.predict([np.asarray(data_aux)])
                raw_pred = labels_dict.get(prediction[0], str(prediction[0]))
                # Majority-vote over recent frames to reduce jitter.
                predicted_character = smooth_prediction(raw_pred)
                # NOTE(review): the confidence below belongs to the raw
                # (unsmoothed) prediction, which may differ from the
                # smoothed label actually displayed.
                if hasattr(model, 'predict_proba'):
                    proba = model.predict_proba([np.asarray(data_aux)])
                    confidence = np.max(proba) * 100
                    confidence_text = f"Confidence: {confidence:.1f}%"
            except Exception as e:
                predicted_character = f"Error: {str(e)}"
                print(f"Prediction error: {e}")

            # Bounding box around all detected hands, with a 10px margin.
            x1 = int(min(x_all) * W) - 10
            y1 = int(min(y_all) * H) - 10
            x2 = int(max(x_all) * W) + 10
            y2 = int(max(y_all) * H) + 10
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 3)

            # Draw the prediction on a filled black background so the text
            # stays readable over any frame content.
            text = predicted_character
            font = cv2.FONT_HERSHEY_SIMPLEX
            font_scale = 1.5
            thickness = 3
            (text_width, text_height), baseline = cv2.getTextSize(text, font, font_scale, thickness)
            cv2.rectangle(frame, (x1, y1 - text_height - 20), (x1 + text_width + 10, y1), (0, 0, 0), -1)
            cv2.putText(frame, text, (x1 + 5, y1 - 10), font, font_scale, (0, 255, 0), thickness, cv2.LINE_AA)

        # Convert BGR back to RGB for display
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return frame, predicted_character, confidence_text
    except Exception as e:
        print(f"Error in predict_sign: {e}")
        return None, f"Error: {str(e)}", ""
# Create Gradio interface with real-time streaming
with gr.Blocks(title="Sign Language Recognition") as demo:
    gr.Markdown(
        """
# 🤟 Real-Time Sign Language Recognition
Show your sign language gesture to the camera for real-time detection!
"""
    )
    with gr.Row():
        with gr.Column():
            # Webcam source; streaming=True delivers frames continuously
            # instead of waiting for a snapshot.
            input_image = gr.Image(
                sources=["webcam"],
                type="pil",
                label="Webcam Feed",
                streaming=True  # Enable streaming for real-time
            )
        with gr.Column():
            # Annotated frame plus the two text readouts filled by
            # predict_sign_realtime.
            output_image = gr.Image(label="Detected Sign")
            predicted_text = gr.Textbox(
                label="Predicted Character",
                scale=1,
                lines=1
            )
            confidence_text = gr.Textbox(
                label="Confidence",
                scale=1,
                lines=1
            )
    gr.Markdown(
        """
### Supported Signs
A-Z letters, Space, Nothing
### Tips for better detection:
- Ensure good lighting
- Keep hand in frame
- Make clear gestures
- Hold the sign steady for 1-2 seconds
"""
    )
    # Set up real-time prediction: every streamed frame is run through
    # predict_sign_realtime and the three outputs are updated live.
    input_image.stream(
        fn=predict_sign_realtime,
        inputs=input_image,
        outputs=[output_image, predicted_text, confidence_text],
        show_progress=False  # Hide progress for smoother experience
    )

if __name__ == "__main__":
    demo.launch()