# inara / app.py — Sign Language Recognition demo (Hugging Face Space by TiH0,
# commit f640e22, verified). File header reconstructed as a comment so the
# module is valid Python.
import gradio as gr
import pickle
import joblib
import cv2
import mediapipe as mp
import numpy as np
from PIL import Image
import warnings
import os
# Suppress sklearn version warnings
# (unpickling a model saved under a different sklearn version emits
# UserWarning; presumably harmless here — TODO confirm versions match).
warnings.filterwarnings('ignore', category=UserWarning)
# Load the model with multiple fallback options
def load_model():
    """Load the classifier, trying each known artifact format in order.

    Preference order: the joblib archive, then the newer pickle
    (``model_v2.p``), then the legacy pickle (``model.p``).  The joblib
    file stores the estimator directly; the pickle files store a dict
    with the estimator under the ``'model'`` key.

    Returns:
        The deserialized model object.

    Raises:
        FileNotFoundError: if none of the expected model files exist.
    """
    if os.path.exists('./model.joblib'):
        print("Loading model from joblib...")
        return joblib.load('./model.joblib')
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file — only ship model files from trusted sources with this app.
    for path in ('./model_v2.p', './model.p'):
        if os.path.exists(path):
            print(f"Loading model from {os.path.basename(path)}...")
            with open(path, 'rb') as f:
                return pickle.load(f)['model']
    raise FileNotFoundError("No model file found!")
# Load the model once at import time so the prediction handler can use it.
try:
    model = load_model()
    print("✓ Model loaded successfully!")
except Exception as e:
    print(f"✗ Error loading model: {e}")
    # Fail fast: the app cannot do anything useful without a model.
    raise
# MediaPipe hand-tracking handles: landmark detector plus drawing helpers
# used to annotate the output frame.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# Initialize hand detection - optimized for speed
hands = mp_hands.Hands(
    static_image_mode=False,  # False for video/real-time
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

# Class index -> display label. Note 'nothing' (14) and 'space' (20) are
# interleaved with the letters, so indices do NOT map to A..Z in order.
labels_dict = {
    0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I',
    9: 'J', 10: 'K', 11: 'L', 12: 'M', 13: 'N', 14: 'nothing', 15: 'O', 16: 'P', 17: 'Q',
    18: 'R', 19: 'S', 20: 'space', 21: 'T', 22: 'U', 23: 'V', 24: 'W', 25: 'X', 26: 'Y', 27: 'Z'
}
# Store history for smoothing predictions
prediction_history = []
HISTORY_SIZE = 5  # number of recent frames that vote on the displayed label


def smooth_prediction(new_pred):
    """Smooth predictions to reduce jitter.

    Appends *new_pred* to a rolling window of the last HISTORY_SIZE
    predictions and returns the most frequent label in that window
    (majority vote; ties broken arbitrarily).

    Args:
        new_pred: raw label predicted for the current frame.

    Returns:
        The majority label over the recent window.
    """
    # The list is only mutated, never rebound, so no `global` is needed.
    prediction_history.append(new_pred)
    # Trim to the last HISTORY_SIZE entries in one step instead of a
    # per-call pop(0); a no-op while the window is still short.
    del prediction_history[:-HISTORY_SIZE]
    # The window is never empty here (we just appended), so the original
    # `if prediction_history:` fallback was dead code and is removed.
    return max(set(prediction_history), key=prediction_history.count)
def _hand_features(hand_landmarks):
    """Extract classifier features for one detected hand.

    Returns (xs, ys, feats): xs/ys are the raw normalized landmark
    coordinates (used later for the bounding box), feats is the flattened
    [x - min(x), y - min(y), ...] vector the model expects per hand.
    """
    xs = [lm.x for lm in hand_landmarks.landmark]
    ys = [lm.y for lm in hand_landmarks.landmark]
    min_x, min_y = min(xs), min(ys)
    feats = []
    for lm in hand_landmarks.landmark:
        feats.append(lm.x - min_x)
        feats.append(lm.y - min_y)
    return xs, ys, feats


def predict_sign_realtime(image):
    """Process image and predict sign language character in real-time.

    Args:
        image: PIL image from the webcam stream, or None.

    Returns:
        Tuple of (annotated RGB frame, predicted label text, confidence
        text). The frame is None when no image was provided or an
        unexpected error occurred.
    """
    if image is None:
        return None, "No image provided", ""
    try:
        # Convert PIL Image to numpy array (RGB), then to BGR for the
        # OpenCV drawing primitives used below.
        frame = np.array(image)
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        H, W, _ = frame.shape
        # MediaPipe expects RGB input.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        predicted_character = "No hand detected"
        confidence_text = ""

        if results.multi_hand_landmarks:
            data_aux = []
            x_all, y_all = [], []
            # One uniform pipeline for one or two hands (max_num_hands=2);
            # the original duplicated this loop for each case.
            for hand_landmarks in results.multi_hand_landmarks:
                xs, ys, feats = _hand_features(hand_landmarks)
                x_all.extend(xs)
                y_all.extend(ys)
                data_aux.extend(feats)
                # Draw hand landmarks on the output frame.
                mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style()
                )
            # Pad with zeros to the two-hand feature length the model was
            # trained on (84 = 2 hands * 21 landmarks * 2 coords); this is
            # a no-op when both hands were detected.
            data_aux.extend([0] * (84 - len(data_aux)))

            try:
                prediction = model.predict([np.asarray(data_aux)])
                raw_pred = labels_dict.get(prediction[0], str(prediction[0]))
                # Majority-vote over recent frames to reduce jitter.
                predicted_character = smooth_prediction(raw_pred)
                # NOTE(review): the confidence below belongs to the raw
                # (unsmoothed) prediction, which may differ from the
                # smoothed label actually displayed.
                if hasattr(model, 'predict_proba'):
                    proba = model.predict_proba([np.asarray(data_aux)])
                    confidence = np.max(proba) * 100
                    confidence_text = f"Confidence: {confidence:.1f}%"
            except Exception as e:
                predicted_character = f"Error: {str(e)}"
                print(f"Prediction error: {e}")

            # Bounding box around all detected hands, with a 10px margin.
            x1 = int(min(x_all) * W) - 10
            y1 = int(min(y_all) * H) - 10
            x2 = int(max(x_all) * W) + 10
            y2 = int(max(y_all) * H) + 10
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 3)

            # Draw the prediction on a filled black background so the text
            # stays readable over any frame content.
            text = predicted_character
            font = cv2.FONT_HERSHEY_SIMPLEX
            font_scale = 1.5
            thickness = 3
            (text_width, text_height), baseline = cv2.getTextSize(text, font, font_scale, thickness)
            cv2.rectangle(frame, (x1, y1 - text_height - 20), (x1 + text_width + 10, y1), (0, 0, 0), -1)
            cv2.putText(frame, text, (x1 + 5, y1 - 10), font, font_scale, (0, 255, 0), thickness, cv2.LINE_AA)

        # Convert BGR back to RGB for display
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return frame, predicted_character, confidence_text
    except Exception as e:
        print(f"Error in predict_sign: {e}")
        return None, f"Error: {str(e)}", ""
# Create Gradio interface with real-time streaming
with gr.Blocks(title="Sign Language Recognition") as demo:
    gr.Markdown(
        """
# 🤟 Real-Time Sign Language Recognition
Show your sign language gesture to the camera for real-time detection!
"""
    )
    with gr.Row():
        with gr.Column():
            # Webcam source; streaming=True delivers frames continuously
            # instead of waiting for a snapshot.
            input_image = gr.Image(
                sources=["webcam"],
                type="pil",
                label="Webcam Feed",
                streaming=True  # Enable streaming for real-time
            )
        with gr.Column():
            # Annotated frame plus the two text readouts filled by
            # predict_sign_realtime.
            output_image = gr.Image(label="Detected Sign")
            predicted_text = gr.Textbox(
                label="Predicted Character",
                scale=1,
                lines=1
            )
            confidence_text = gr.Textbox(
                label="Confidence",
                scale=1,
                lines=1
            )
    gr.Markdown(
        """
### Supported Signs
A-Z letters, Space, Nothing
### Tips for better detection:
- Ensure good lighting
- Keep hand in frame
- Make clear gestures
- Hold the sign steady for 1-2 seconds
"""
    )
    # Set up real-time prediction: every streamed frame is run through
    # predict_sign_realtime and the three outputs are updated live.
    input_image.stream(
        fn=predict_sign_realtime,
        inputs=input_image,
        outputs=[output_image, predicted_text, confidence_text],
        show_progress=False  # Hide progress for smoother experience
    )

if __name__ == "__main__":
    demo.launch()