# NOTE: The Hugging Face Spaces page header (status, file size 21,021 bytes,
# commit hashes 5378b38 / fdfd4e1 / 2ae9d2d, and the rendered line-number
# gutter) was scrape residue, not part of the program, and has been removed.
"""
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β CISC 121 - HAND GESTURE RECOGNITION APP β
β Queen's University β
β β
β PURPOSE: This app uses AI to recognize hand gestures (one, peace, etc.) β
β VERSION: Procedural (step-by-step) - Great for beginners! β
β β
β HOW TO RUN: python app.py β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
"""
# ==============================================================================
# SECTION 1: IMPORTS
# ==============================================================================
# What are imports?
# Imports let us use code that other people wrote.
# Instead of writing everything from scratch, we can use "libraries".
#
# Think of it like borrowing tools:
# - gradio = tools for building web pages
# - transformers = tools for AI/machine learning
# - time = tools for measuring how long things take
# - os = tools for working with the operating system (like reading files)
# ==============================================================================
import gradio as gr
# "gr" is a short nickname for "gradio" - it saves us typing!
# Example: instead of gradio.Button(), we can write gr.Button()
from transformers import pipeline
# "pipeline" is a function that makes using AI models easy.
# It handles all the complicated setup for us.
from time import perf_counter
# "perf_counter" is like a stopwatch - it measures time very precisely.
import os
# "os" lets us interact with the operating system
# We use it to read environment variables (like secret tokens)
# ==============================================================================
# SECTION 2: CONFIGURATION (SETTINGS)
# ==============================================================================
# What is configuration?
# These are settings we can change to customize how the app works.
# By putting them at the top, they're easy to find and modify.
# ==============================================================================
# The AI model we will use for hand gesture recognition
#
# MODEL OPTIONS:
# 1. "dima806/hand_gestures_image_detection" (RECOMMENDED)
# - Recognizes: one, two, three, four, fist, ok, like, peace, etc.
# - Trained specifically for hand gestures!
#
# 2. "google/vit-base-patch16-224" (General purpose)
# - Recognizes 1000 everyday objects (cats, cars, etc.)
# - NOT trained for hand gestures - won't work for finger counting
#
# 3. "microsoft/resnet-50" (General purpose, faster)
# - Similar to Google's model, but faster
#
# The Hugging Face model repository id passed to pipeline() in analyze_image().
MODEL_NAME: str = "dima806/hand_gestures_image_detection"
# Hugging Face Token (Optional but recommended)
# Some models require authentication to download.
# Get your free token at: https://huggingface.co/settings/tokens
#
# Option 1: Set as environment variable (recommended for security)
# export HF_TOKEN="your_token_here"
#
# Option 2: Paste directly here (less secure, but okay for learning)
# HF_TOKEN = "hf_xxxxxxxxxxxxxxxxxxxxx"
#
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# os.environ.get() tries to read the HF_TOKEN from environment variables
# If not found, it returns None (which means "no token")
# App title and description, rendered as Markdown by create_app().
APP_TITLE: str = "## π CISC 121 - Hand Gesture Recognition App"
APP_DESCRIPTION: str = """
Welcome! This app uses AI to recognize **hand gestures**.
**Supported Gestures:**
β one, βοΈ two/peace, π€ three, π four, β fist, π like, π dislike, π ok, π€ stop
**How to use:**
1. **Upload an image** OR **use your webcam**
2. Show a hand gesture clearly in frame
3. Click **"π Analyze Image"** to see the AI's prediction
> π‘ **Tip:** Make sure your hand is well-lit and clearly visible!
"""
# ==============================================================================
# SECTION 3: HELPER FUNCTIONS
# ==============================================================================
# What are functions?
# Functions are reusable blocks of code that do one specific job.
# We give them a name, and then we can "call" them whenever we need them.
#
# Why use functions?
# 1. Reusability - write once, use many times
# 2. Organization - break big problems into small pieces
# 3. Readability - give meaningful names to actions
# ==============================================================================
def create_greeting(name):
    """
    Build a personalized welcome message.

    Parameters
    ----------
    name : str
        The name of the person to greet.

    Returns
    -------
    str
        The greeting text.

    Example
    -------
    >>> create_greeting("Alice")
    'Hello Alice! Welcome to CISC 121!'
    """
    # An f-string substitutes the value of `name` directly into the text.
    return f"Hello {name}! Welcome to CISC 121!"
def analyze_image(image):
    """
    Send an image to the AI model and return its predictions.

    How does this work?
    1. The image is handed to a Hugging Face image-classification pipeline
    2. The AI model analyzes the image
    3. We get back a list of predictions with confidence scores

    Parameters
    ----------
    image : PIL.Image, numpy.ndarray, str, or None
        The image to analyze (Gradio may pass a PIL image, an array,
        or a file path).  None means "no image was provided".

    Returns
    -------
    tuple
        (results, elapsed_time) where results is a list of prediction
        dicts with 'label' and 'score' keys (or None on failure) and
        elapsed_time is the analysis duration in seconds (0.0 on failure).
    """
    # Safety check: make sure we actually received an image.
    if image is None:
        print("β οΈ No image provided")
        return None, 0.0
    # Debug: show what type of image we received.
    print(f"π· Received image type: {type(image)}")
    print(f"π· Image info: {image if not hasattr(image, 'size') else f'Size: {image.size}'}")
    # Start the stopwatch.
    start_time = perf_counter()
    try:
        # PERFORMANCE FIX: the original rebuilt the pipeline (reloading the
        # whole model) on every single click.  We now build it once and
        # cache it as an attribute on this function for reuse.
        classifier = getattr(analyze_image, "_classifier", None)
        if classifier is None:
            print(f"π Loading model: {MODEL_NAME}")
            print(f"π HF Token: {'Set' if HF_TOKEN else 'Not set (may limit some models)'}")
            classifier = pipeline(
                task="image-classification",  # What kind of task?
                model=MODEL_NAME,             # Which AI model to use?
                token=HF_TOKEN                # Authentication token (optional)
            )
            analyze_image._classifier = classifier
        print("π· Analyzing image...")
        # Handle different image formats that Gradio might send:
        # a file path, a PIL Image, or a numpy array.
        from PIL import Image
        if isinstance(image, str):
            # It's a file path - open it.
            print("   (Converting from file path)")
            image = Image.open(image)
        elif hasattr(image, 'convert'):
            # It's already a PIL Image - ensure it's in RGB format.
            print("   (Image is PIL format)")
            if image.mode != 'RGB':
                image = image.convert('RGB')
        else:
            # It might be a numpy array - convert to PIL.
            print("   (Converting from numpy array)")
            import numpy as np
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
        # Send the image to the model and get predictions.
        results = classifier(image)
        # BUG FIX: this message was split across two source lines, making the
        # f-string a syntax error; it is now a single literal.
        print(f"β Analysis complete! Found {len(results)} predictions.")
    except Exception as error:
        # Catch any failure so the app does not crash; report it instead.
        print(f"β Error during image analysis: {error}")
        print(f"   Error type: {type(error).__name__}")
        # Print full traceback for debugging.
        import traceback
        traceback.print_exc()
        # Common error explanations.
        if "401" in str(error) or "unauthorized" in str(error).lower():
            print("   π‘ This might be an authentication issue. Try setting HF_TOKEN.")
        elif "connection" in str(error).lower() or "network" in str(error).lower():
            print("   π‘ Check your internet connection.")
        elif "memory" in str(error).lower():
            print("   π‘ The model might be too large. Try a smaller model.")
        return None, 0.0
    # Stop the stopwatch and compute the elapsed time.
    end_time = perf_counter()
    elapsed_time = end_time - start_time
    return results, elapsed_time
def format_results(results, elapsed_time):
"""
Formats the AI predictions into a readable string.
Why format results?
The raw data from the AI is hard to read.
We transform it into a nice, human-friendly format.
Parameters:
-----------
results : list or None
The predictions from the AI model.
Each prediction has a 'label' and a 'score' (confidence).
elapsed_time : float
How long the analysis took, in seconds.
Returns:
--------
str
A formatted string showing the predictions.
"""
# Handle the case where analysis failed
if results is None:
return "β Could not analyze the image. Please try again."
# Start building our output message
output_lines = []
# Add a header
output_lines.append("## π Analysis Results\n")
output_lines.append(f"β±οΈ *Analysis completed in {elapsed_time:.2f} seconds*\n")
# What does :.2f mean?
# It formats a number to show 2 decimal places.
# Example: 1.23456 becomes "1.23"
output_lines.append("### Top Predictions:\n")
# Loop through the top 5 predictions
# enumerate() gives us both the index (i) and the item (prediction)
for i, prediction in enumerate(results[:5]):
label = prediction['label'] # What the AI thinks it sees
score = prediction['score'] # How confident it is (0 to 1)
percentage = score * 100 # Convert to percentage
# Add a medal emoji for top 3
if i == 0:
medal = "π₯"
elif i == 1:
medal = "π₯"
elif i == 2:
medal = "π₯"
else:
medal = " "
output_lines.append(f"{medal} **{label}**: {percentage:.1f}%\n")
# Join all lines into one string
# '\n' means "new line" (like pressing Enter)
return ''.join(output_lines)
# ==============================================================================
# SECTION 4: MAIN APPLICATION
# ==============================================================================
# This section builds the actual web interface.
# We use Gradio's "Blocks" system to create a custom layout.
#
# What is gr.Blocks()?
# It's like a container for our app.
# Everything inside the "with" block becomes part of the interface.
#
# What does "with" do?
# "with" creates a context - it's like saying "everything in here belongs together"
# When we exit the "with" block, Gradio knows our app is complete.
# ==============================================================================
def create_app():
    """
    Create and return the Gradio application.

    Why put this in a function?
    1. It keeps the code organized
    2. We can easily test or modify the app
    3. It's a good habit for larger programs

    Returns
    -------
    gr.Blocks
        The complete Gradio application, ready to launch.
    """
    # Create the app container.
    # We use try/except for the theme to support different Gradio versions:
    # older releases don't accept the theme parameter the same way.
    try:
        # Try modern Gradio syntax (4.x+).
        app = gr.Blocks(
            title="CISC 121 Gesture App",  # Browser tab title
            theme=gr.themes.Soft()         # A nice, modern look
        )
    except TypeError:
        # Fall back for older Gradio versions.
        app = gr.Blocks(title="CISC 121 Gesture App")
    with app:
        # ------------------------------------------------------------------
        # PART A: HEADER SECTION
        # ------------------------------------------------------------------
        # gr.Markdown() renders formatted text using Markdown syntax.
        gr.Markdown(APP_TITLE)
        gr.Markdown(APP_DESCRIPTION)
        gr.Markdown("---")  # Horizontal rule for visual separation.
        # ------------------------------------------------------------------
        # PART B: IMAGE INPUT AND RESULTS SECTION
        # ------------------------------------------------------------------
        # gr.Row() lays components out side by side; gr.Column() stacks them.
        with gr.Row():
            # Left column: image input.
            with gr.Column(scale=1):
                gr.Markdown("### πΈ Image Input")
                # Tabs make the two input methods (upload vs webcam) explicit.
                with gr.Tabs():
                    # Tab 1: upload an image file.
                    with gr.TabItem("π Upload"):
                        upload_input = gr.Image(
                            label="Click to upload or drag an image here",
                            sources=["upload"],
                            type="pil",
                            height=250
                        )
                    # Tab 2: use the webcam (captures on click).
                    with gr.TabItem("π· Webcam"):
                        webcam_input = gr.Image(
                            label="Click the π· button below the preview to capture",
                            sources=["webcam"],
                            type="pil",
                            height=250
                            # Note: mirror_webcam omitted for compatibility
                            # with older Gradio versions.
                        )
                # Status indicator - shows when an image is ready.
                status_display = gr.Markdown("π *Choose a tab above and provide an image*")
                # The submit button.
                submit_button = gr.Button(
                    value="π Analyze Image",
                    variant="primary",
                    size="lg"
                )
            # Right column: results.
            with gr.Column(scale=1):
                gr.Markdown("### π Results")
                # This Markdown component is updated when the user clicks
                # the button.
                results_display = gr.Markdown(
                    value="*Upload or capture an image, then click 'Analyze Image' to see results.*"
                )
        # ------------------------------------------------------------------
        # PART C: CONNECTING COMPONENTS (EVENT HANDLING)
        # ------------------------------------------------------------------
        # Two input sources (upload and webcam) both feed one shared State.
        # gr.State() stores data between interactions.
        current_image = gr.State(value=None)

        def on_upload(image):
            """Called when the user uploads an image: store it + update status."""
            if image is not None:
                # BUG FIX: this literal was split across two source lines
                # (a syntax error); it is now a single line.
                return image, "β **Image uploaded!** Click 'Analyze Image' to continue."
            return None, "π *Choose a tab above and provide an image*"

        def on_webcam_capture(image):
            """Called when the user captures from the webcam: store + status."""
            if image is not None:
                # BUG FIX: same split-literal syntax error as on_upload.
                return image, "β **Photo captured!** Click 'Analyze Image' to continue."
            return None, "π *Choose a tab above and provide an image*"

        def on_submit(stored_image):
            """
            Event handler for the submit button click.

            Parameters
            ----------
            stored_image : PIL.Image or None
                The image stored from upload or webcam capture.

            Returns
            -------
            str
                Formatted results (or guidance if no image was provided).
            """
            # Check if we have an image.
            if stored_image is None:
                return "β οΈ **No image detected!**\n\n**To fix this:**\n\nπ **Upload Tab:** Click the upload area and select an image file\n\nπ· **Webcam Tab:** Click the camera button (π·) to capture a photo\n\nThen click 'Analyze Image' again."
            # Step 1: analyze the image.
            results, elapsed_time = analyze_image(stored_image)
            # Step 2: format the results nicely and return them for display.
            return format_results(results, elapsed_time)

        # Connect upload input - when the image changes, store it.
        upload_input.change(
            fn=on_upload,
            inputs=[upload_input],
            outputs=[current_image, status_display]
        )
        # Connect webcam input - when an image is captured, store it.
        webcam_input.change(
            fn=on_webcam_capture,
            inputs=[webcam_input],
            outputs=[current_image, status_display]
        )
        # Connect the button click to analyze the stored image.
        submit_button.click(
            fn=on_submit,
            inputs=[current_image],
            outputs=[results_display]
        )
        # ------------------------------------------------------------------
        # PART D: FOOTER
        # ------------------------------------------------------------------
        gr.Markdown("---")
        gr.Markdown(
            "*Made for CISC 121 at Queen's University* π"
        )
    # Return the completed app.
    return app
# ==============================================================================
# SECTION 5: RUNNING THE APP
# ==============================================================================
# This is where we actually start the application.
#
# What does if __name__ == "__main__" mean?
# This checks if we're running this file directly (not importing it).
# If we run: python hf_gradio_proj.py β this code runs
# If we import: from hf_gradio_proj import create_app β this code doesn't run
#
# Why is this useful?
# It lets us use the same file in two ways:
# 1. As a standalone app (run it directly)
# 2. As a module (import functions into other files)
# ==============================================================================
if __name__ == "__main__":
# Print a welcome message to the terminal
print("=" * 60)
print("π CISC 121 - Gesture Recognition App")
print("=" * 60)
print("Starting the application...")
print("Once ready, open the URL shown below in your browser.")
print("=" * 60)
# Create the app
app = create_app()
# Launch the app
# share=True creates a public URL anyone can access
# This is useful for sharing with classmates or instructors
app.launch(share=True)
|