Spaces:

RuslanKain
/

simple-gesture-predictor

Sleeping

RuslanKain

removedmirror_webcam for Gradio compatibility

2ae9d2d about 1 month ago

21 kB

	"""
	╔══════════════════════════════════════════════════════════════════════════════╗
	║ CISC 121 - HAND GESTURE RECOGNITION APP ║
	║ Queen's University ║
	║ ║
	║ PURPOSE: This app uses AI to recognize hand gestures (one, peace, etc.) ║
	║ VERSION: Procedural (step-by-step) - Great for beginners! ║
	║ ║
	║ HOW TO RUN: python app.py ║
	╚══════════════════════════════════════════════════════════════════════════════╝
	"""

	# ==============================================================================
	# SECTION 1: IMPORTS
	# ==============================================================================
	# What are imports?
	# Imports let us use code that other people wrote.
	# Instead of writing everything from scratch, we can use "libraries".
	#
	# Think of it like borrowing tools:
	# - gradio = tools for building web pages
	# - transformers = tools for AI/machine learning
	# - time = tools for measuring how long things take
	# - os = tools for working with the operating system (like reading files)
	# ==============================================================================

	import gradio as gr
	# "gr" is a short nickname for "gradio" - it saves us typing!
	# Example: instead of gradio.Button(), we can write gr.Button()

	from transformers import pipeline
	# "pipeline" is a function that makes using AI models easy.
	# It handles all the complicated setup for us.

	from time import perf_counter
	# "perf_counter" is like a stopwatch - it measures time very precisely.

	import os
	# "os" lets us interact with the operating system
	# We use it to read environment variables (like secret tokens)


	# ==============================================================================
	# SECTION 2: CONFIGURATION (SETTINGS)
	# ==============================================================================
	# What is configuration?
	# These are settings we can change to customize how the app works.
	# By putting them at the top, they're easy to find and modify.
	# ==============================================================================

	# The AI model we will use for hand gesture recognition
	#
	# MODEL OPTIONS:
	# 1. "dima806/hand_gestures_image_detection" (RECOMMENDED)
	# - Recognizes: one, two, three, four, fist, ok, like, peace, etc.
	# - Trained specifically for hand gestures!
	#
	# 2. "google/vit-base-patch16-224" (General purpose)
	# - Recognizes 1000 everyday objects (cats, cars, etc.)
	# - NOT trained for hand gestures - won't work for finger counting
	#
	# 3. "microsoft/resnet-50" (General purpose, faster)
	# - Similar to Google's model, but faster
	#
	MODEL_NAME = "dima806/hand_gestures_image_detection"

	# Hugging Face access token (optional, but recommended).
	# A token is only needed for models that require authentication to download.
	# Free tokens are created at: https://huggingface.co/settings/tokens
	#
	# Two ways to supply it:
	# 1. Environment variable (preferred - keeps the secret out of the code):
	#        export HF_TOKEN="your_token_here"
	# 2. Hard-coded in this file (less secure, acceptable while learning):
	#        HF_TOKEN = "hf_xxxxxxxxxxxxxxxxxxxxx"
	#
	# os.getenv() looks the name up in the environment and gives back None
	# when the variable is not set, which here means "no token".
	HF_TOKEN = os.getenv("HF_TOKEN")

	# Markdown text displayed at the top of the web page.
	APP_TITLE = "## 🎓 CISC 121 - Hand Gesture Recognition App"
	APP_DESCRIPTION = """
	Welcome! This app uses AI to recognize hand gestures.

	Supported Gestures:
	✋ one, ✌️ two/peace, 🤟 three, 🖖 four, ✊ fist, 👍 like, 👎 dislike, 👌 ok, 🤚 stop

	How to use:
	1. Upload an image OR use your webcam
	2. Show a hand gesture clearly in frame
	3. Click "🔍 Analyze Image" to see the AI's prediction

	> 💡 Tip: Make sure your hand is well-lit and clearly visible!
	"""


	# ==============================================================================
	# SECTION 3: HELPER FUNCTIONS
	# ==============================================================================
	# What are functions?
	# Functions are reusable blocks of code that do one specific job.
	# We give them a name, and then we can "call" them whenever we need them.
	#
	# Why use functions?
	# 1. Reusability - write once, use many times
	# 2. Organization - break big problems into small pieces
	# 3. Readability - give meaningful names to actions
	# ==============================================================================

	def create_greeting(name):
	    """
	    Build a personalized welcome message.

	    The value of ``name`` is dropped into a fixed sentence, so whatever
	    text you pass in shows up between "Hello" and the exclamation mark.

	    Parameters:
	    -----------
	    name : str
	        Who the greeting is for ("str" means a piece of text).

	    Returns:
	    --------
	    str
	        The finished greeting.

	    Example:
	    --------
	    >>> create_greeting("Alice")
	    'Hello Alice! Welcome to CISC 121!'
	    """
	    # str.format() replaces the {} placeholder with the value of 'name'
	    return "Hello {}! Welcome to CISC 121!".format(name)


	# Classifiers that have already been built, keyed by model name.
	# Building a pipeline loads the whole model, so we only want to do it once
	# instead of on every button click.
	_CLASSIFIER_CACHE = {}


	def _get_classifier():
	    """Return the image-classification pipeline for MODEL_NAME, building it on first use."""
	    classifier = _CLASSIFIER_CACHE.get(MODEL_NAME)
	    if classifier is None:
	        print(f"🔄 Loading model: {MODEL_NAME}")
	        print(f"🔑 HF Token: {'Set' if HF_TOKEN else 'Not set (may limit some models)'}")
	        classifier = pipeline(
	            task="image-classification",  # What kind of task?
	            model=MODEL_NAME,             # Which AI model to use?
	            token=HF_TOKEN,               # Authentication token (optional)
	        )
	        _CLASSIFIER_CACHE[MODEL_NAME] = classifier
	    return classifier


	def _to_rgb_image(image):
	    """Turn a file path, numpy array, or PIL image into an RGB PIL image."""
	    from PIL import Image

	    if isinstance(image, str):
	        # It's a file path - open it
	        print(" (Converting from file path)")
	        image = Image.open(image)
	    elif hasattr(image, 'convert'):
	        # It's already a PIL Image
	        print(" (Image is PIL format)")
	    else:
	        # It might be a numpy array - convert to PIL
	        print(" (Converting from numpy array)")
	        import numpy as np
	        if isinstance(image, np.ndarray):
	            image = Image.fromarray(image)

	    # Whatever the source was, hand the model an RGB image
	    # (uploads can be grayscale, palette-based, or carry an alpha channel).
	    if hasattr(image, 'convert') and image.mode != 'RGB':
	        image = image.convert('RGB')
	    return image


	def analyze_image(image):
	    """
	    Runs the gesture model on an image and gets back predictions.

	    How does this work?
	    1. The model is loaded with transformers' pipeline() the first time
	       this function is called, then reused on later calls
	    2. The image is converted to an RGB PIL image
	    3. The classifier is called on the image and returns a list of
	       predictions with confidence scores

	    Parameters:
	    -----------
	    image : PIL.Image, numpy.ndarray, str, or None
	        The image to analyze. A str is treated as a file path.

	    Returns:
	    --------
	    tuple
	        A tuple containing:
	        - results (list or None): The AI's predictions, or None when no
	          image was given or the analysis failed
	        - elapsed_time (float): How long the analysis took in seconds
	          (0.0 on failure; the first successful call also includes the
	          time spent loading the model)

	    What is a tuple?
	    A tuple is like a container that holds multiple values.
	    We use it when a function needs to return more than one thing.
	    """
	    # Safety check: make sure we actually received an image
	    # "None" means "nothing" - the user might not have taken a photo yet
	    if image is None:
	        print("⚠️ No image provided")
	        return None, 0.0

	    # Debug: Print what type of image we received
	    print(f"📷 Received image type: {type(image)}")
	    image_info = f"Size: {image.size}" if hasattr(image, 'size') else image
	    print(f"📷 Image info: {image_info}")

	    # Start the stopwatch
	    start_time = perf_counter()

	    try:
	        classifier = _get_classifier()

	        print("📷 Analyzing image...")
	        image = _to_rgb_image(image)

	        # Send the image to the model and get predictions
	        results = classifier(image)

	        print(f"✅ Analysis complete! Found {len(results)} predictions.")

	    except Exception as error:
	        # If something goes wrong, we catch the error and report it.
	        # This keeps the web app running instead of crashing.
	        print(f"❌ Error during image analysis: {error}")
	        print(f" Error type: {type(error).__name__}")

	        # Print full traceback for debugging
	        import traceback
	        traceback.print_exc()

	        # Common error explanations
	        message = str(error).lower()
	        if "401" in message or "unauthorized" in message:
	            print(" 💡 This might be an authentication issue. Try setting HF_TOKEN.")
	        elif "connection" in message or "network" in message:
	            print(" 💡 Check your internet connection.")
	        elif "memory" in message:
	            print(" 💡 The model might be too large. Try a smaller model.")

	        return None, 0.0

	    # Stop the stopwatch and work out how long it took
	    elapsed_time = perf_counter() - start_time

	    return results, elapsed_time


	def format_results(results, elapsed_time):
	    """
	    Formats the AI predictions into a readable Markdown string.

	    Why format results?
	    The raw data from the AI is hard to read.
	    We transform it into a nice, human-friendly format.

	    Each piece of the output (header, timing, every prediction) is
	    separated by a blank line. Markdown treats a single newline as a
	    space, so without the blank lines all predictions would run together
	    in one paragraph when displayed.

	    Parameters:
	    -----------
	    results : list or None
	        The predictions from the AI model.
	        Each prediction has a 'label' and a 'score' (confidence).
	        None means the analysis failed.

	    elapsed_time : float
	        How long the analysis took, in seconds.

	    Returns:
	    --------
	    str
	        A formatted string showing up to five predictions, or an error
	        message when results is None.
	    """
	    # Handle the case where analysis failed
	    if results is None:
	        return "❌ Could not analyze the image. Please try again."

	    # Medal emojis for first, second and third place
	    medals = ("🥇", "🥈", "🥉")

	    # :.2f formats a number with 2 decimal places (1.23456 becomes "1.23")
	    output_blocks = [
	        "## 🔍 Analysis Results",
	        f"⏱️ Analysis completed in {elapsed_time:.2f} seconds",
	        "### Top Predictions:",
	    ]

	    # An empty list is a valid answer from the model - say so explicitly
	    if not results:
	        output_blocks.append("No predictions were returned.")

	    # Loop through the top 5 predictions
	    # enumerate() gives us both the position (rank) and the item (prediction)
	    for rank, prediction in enumerate(results[:5]):
	        label = prediction['label']             # What the AI thinks it sees
	        percentage = prediction['score'] * 100  # Confidence (0 to 1) as a percentage

	        line = f"{label}: {percentage:.1f}%"
	        if rank < len(medals):
	            line = f"{medals[rank]} {line}"
	        output_blocks.append(line)

	    # A blank line ("\n\n") between blocks makes Markdown show each one
	    # on its own line.
	    return "\n\n".join(output_blocks) + "\n"


	# ==============================================================================
	# SECTION 4: MAIN APPLICATION
	# ==============================================================================
	# This section builds the actual web interface.
	# We use Gradio's "Blocks" system to create a custom layout.
	#
	# What is gr.Blocks()?
	# It's like a container for our app.
	# Everything inside the "with" block becomes part of the interface.
	#
	# What does "with" do?
	# "with" creates a context - it's like saying "everything in here belongs together"
	# When we exit the "with" block, Gradio knows our app is complete.
	# ==============================================================================

	def create_app():
	    """
	    Assemble the Gradio interface and hand it back, ready to launch.

	    The page is built top to bottom: a header, a two-column row (image
	    input on the left, results on the right), the event wiring that
	    connects the inputs to the analysis functions, and a footer.

	    Keeping all of this in one function means the interface is only built
	    when somebody asks for it, and the same function can be reused from
	    other files.

	    Returns:
	    --------
	    gr.Blocks
	        The complete Gradio application (not yet launched).
	    """
	    page_title = "CISC 121 Gesture App"  # Browser tab title
	    idle_hint = "👆 Choose a tab above and provide an image"

	    # Ask for the "Soft" theme first. If this Gradio version rejects the
	    # theme argument with a TypeError, build the container without it.
	    try:
	        demo = gr.Blocks(title=page_title, theme=gr.themes.Soft())
	    except TypeError:
	        demo = gr.Blocks(title=page_title)

	    # Everything created inside this "with" block becomes part of the page.
	    with demo:

	        # ------------------------------------------------------------------
	        # PART A: HEADER
	        # ------------------------------------------------------------------
	        # gr.Markdown() shows text written in Markdown (like a README file).
	        gr.Markdown(APP_TITLE)
	        gr.Markdown(APP_DESCRIPTION)
	        gr.Markdown("---")  # horizontal divider line

	        # ------------------------------------------------------------------
	        # PART B: IMAGE INPUT (LEFT) AND RESULTS (RIGHT)
	        # ------------------------------------------------------------------
	        # gr.Row() lays its children out side by side;
	        # gr.Column() stacks its children vertically.
	        with gr.Row():

	            with gr.Column(scale=1):
	                gr.Markdown("### 📸 Image Input")

	                # One tab per way of providing an image
	                with gr.Tabs():

	                    with gr.TabItem("📁 Upload"):
	                        upload_input = gr.Image(
	                            label="Click to upload or drag an image here",
	                            sources=["upload"],
	                            type="pil",
	                            height=250,
	                        )

	                    with gr.TabItem("📷 Webcam"):
	                        # mirror_webcam is deliberately not passed here, for
	                        # compatibility with older Gradio versions.
	                        webcam_input = gr.Image(
	                            label="Click the 📷 button below the preview to capture",
	                            sources=["webcam"],
	                            type="pil",
	                            height=250,
	                        )

	                # Short status line that tells the user whether an image is ready
	                status_display = gr.Markdown(idle_hint)

	                submit_button = gr.Button(
	                    value="🔍 Analyze Image",
	                    variant="primary",
	                    size="lg",
	                )

	            with gr.Column(scale=1):
	                gr.Markdown("### 📊 Results")

	                # This Markdown area is overwritten each time the button is clicked
	                results_display = gr.Markdown(
	                    value="Upload or capture an image, then click 'Analyze Image' to see results."
	                )

	        # ------------------------------------------------------------------
	        # PART C: EVENT HANDLING
	        # ------------------------------------------------------------------
	        # Both tabs feed the same gr.State, which remembers the most recent
	        # image between interactions so the button has something to analyze.
	        current_image = gr.State(value=None)

	        def on_upload(image):
	            """Store a newly uploaded image and update the status line."""
	            if image is None:
	                return None, idle_hint
	            return image, "✅ Image uploaded! Click 'Analyze Image' to continue."

	        def on_webcam_capture(image):
	            """Store a newly captured webcam photo and update the status line."""
	            if image is None:
	                return None, idle_hint
	            return image, "✅ Photo captured! Click 'Analyze Image' to continue."

	        def on_submit(stored_image):
	            """
	            Event handler for the "Analyze Image" button.

	            Parameters:
	            -----------
	            stored_image : PIL.Image or None
	                The image most recently stored by the upload or webcam handler.

	            Returns:
	            --------
	            str
	                The formatted predictions, or instructions when there is
	                no image to analyze yet.
	            """
	            if stored_image is None:
	                return "⚠️ No image detected!\n\nTo fix this:\n\n📁 Upload Tab: Click the upload area and select an image file\n\n📷 Webcam Tab: Click the camera button (📷) to capture a photo\n\nThen click 'Analyze Image' again."

	            # analyze_image() returns (results, elapsed_time), which is exactly
	            # the pair of arguments format_results() expects.
	            return format_results(*analyze_image(stored_image))

	        # Whenever either image component changes, remember its image
	        upload_input.change(
	            fn=on_upload,
	            inputs=[upload_input],
	            outputs=[current_image, status_display],
	        )
	        webcam_input.change(
	            fn=on_webcam_capture,
	            inputs=[webcam_input],
	            outputs=[current_image, status_display],
	        )

	        # Clicking the button analyzes whatever image is currently stored
	        submit_button.click(
	            fn=on_submit,
	            inputs=[current_image],
	            outputs=[results_display],
	        )

	        # ------------------------------------------------------------------
	        # PART D: FOOTER
	        # ------------------------------------------------------------------
	        gr.Markdown("---")
	        gr.Markdown("Made for CISC 121 at Queen's University 🎓")

	    return demo


	# ==============================================================================
	# SECTION 5: RUNNING THE APP
	# ==============================================================================
	# This is where we actually start the application.
	#
	# What does if __name__ == "__main__" mean?
	# This checks if we're running this file directly (not importing it).
	# If we run: python app.py → this code runs
	# If we import: from app import create_app → this code doesn't run
	#
	# Why is this useful?
	# It lets us use the same file in two ways:
	# 1. As a standalone app (run it directly)
	# 2. As a module (import functions into other files)
	# ==============================================================================

	if __name__ == "__main__":
	    # Welcome banner for the terminal: one print() call, with sep="\n"
	    # putting each piece on its own line.
	    print(
	        "=" * 60,
	        "🎓 CISC 121 - Gesture Recognition App",
	        "=" * 60,
	        "Starting the application...",
	        "Once ready, open the URL shown below in your browser.",
	        "=" * 60,
	        sep="\n",
	    )

	    # Build the interface
	    app = create_app()

	    # Start the web server.
	    # share=True asks Gradio for a public URL that anyone can open,
	    # which is handy for showing the app to classmates or instructors.
	    app.launch(share=True)