Spaces:

RuslanKain
/

simple-gesture-predictor

Sleeping

File size: 21,021 Bytes

"""
╔══════════════════════════════════════════════════════════════════════════════╗
║                    CISC 121 - HAND GESTURE RECOGNITION APP                  ║
║                         Queen's University                                   ║
║                                                                              ║
║  PURPOSE: This app uses AI to recognize hand gestures (one, peace, etc.)   ║
║  VERSION: Procedural (step-by-step) - Great for beginners!                  ║
║                                                                              ║
║  HOW TO RUN: python app.py                                                   ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""

# ==============================================================================
# SECTION 1: IMPORTS
# ==============================================================================
# What are imports?
#   Imports let us use code that other people wrote.
#   Instead of writing everything from scratch, we can use "libraries".
#
# Think of it like borrowing tools:
#   - gradio = tools for building web pages
#   - transformers = tools for AI/machine learning
#   - time = tools for measuring how long things take
#   - os = tools for working with the operating system (like reading files)
# ==============================================================================

import gradio as gr
# "gr" is a short nickname for "gradio" - it saves us typing!
# Example: instead of gradio.Button(), we can write gr.Button()

from transformers import pipeline
# "pipeline" is a function that makes using AI models easy.
# It handles all the complicated setup for us.

from time import perf_counter
# "perf_counter" is like a stopwatch - it measures time very precisely.

import os
# "os" lets us interact with the operating system
# We use it to read environment variables (like secret tokens)


# ==============================================================================
# SECTION 2: CONFIGURATION (SETTINGS)
# ==============================================================================
# What is configuration?
#   These are settings we can change to customize how the app works.
#   By putting them at the top, they're easy to find and modify.
# ==============================================================================

# Hugging Face model id of the image classifier used for gesture recognition.
# Change this one string to try a different model.
#
# MODEL OPTIONS:
#   1. "dima806/hand_gestures_image_detection" (RECOMMENDED)
#      - Recognizes: one, two, three, four, fist, ok, like, peace, etc.
#      - Trained specifically for hand gestures!
#
#   2. "google/vit-base-patch16-224" (General purpose)
#      - Recognizes 1000 everyday objects (cats, cars, etc.)
#      - NOT trained for hand gestures - won't work for finger counting
#
#   3. "microsoft/resnet-50" (General purpose, faster)
#      - Similar to Google's model, but faster
#      - NOTE(review): presumably also not gesture-specific - confirm before use
#
MODEL_NAME = "dima806/hand_gestures_image_detection"

# Hugging Face Token (Optional but recommended)
# Some models require authentication to download.
# Get your free token at: https://huggingface.co/settings/tokens
#
# Option 1: Set as environment variable (recommended for security)
#   export HF_TOKEN="your_token_here"
#
# Option 2: Paste directly here (for local experiments ONLY)
#   HF_TOKEN = "hf_xxxxxxxxxxxxxxxxxxxxx"
#   WARNING: never commit or share a file that contains a real token.
#   Anyone who can read the file can use your account.
#
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# os.environ.get() looks up HF_TOKEN in the environment variables.
# If the variable is not set, it returns None (which means "no token").

# Text shown at the top of the web page. Both strings are Markdown.
APP_TITLE = "## 🎓 CISC 121 - Hand Gesture Recognition App"
APP_DESCRIPTION = """
Welcome! This app uses AI to recognize **hand gestures**.

**Supported Gestures:**
✋ one, ✌️ two/peace, 🤟 three, 🖖 four, ✊ fist, 👍 like, 👎 dislike, 👌 ok, 🤚 stop

**How to use:**
1. **Upload an image** OR **use your webcam**
2. Show a hand gesture clearly in frame
3. Click **"🔍 Analyze Image"** to see the AI's prediction

> 💡 **Tip:** Make sure your hand is well-lit and clearly visible!
"""


# ==============================================================================
# SECTION 3: HELPER FUNCTIONS
# ==============================================================================
# What are functions?
#   Functions are reusable blocks of code that do one specific job.
#   We give them a name, and then we can "call" them whenever we need them.
#
# Why use functions?
#   1. Reusability - write once, use many times
#   2. Organization - break big problems into small pieces
#   3. Readability - give meaningful names to actions
# ==============================================================================

def create_greeting(name):
    """
    Builds the welcome message for one person.

    This is a tiny example of a function: it takes one input,
    does one job, and hands back one result.

    Parameters:
    -----------
    name : str
        Who to greet. It is inserted into the message as text.

    Returns:
    --------
    str
        The finished greeting.

    Example:
    --------
    >>> create_greeting("Alice")
    'Hello Alice! Welcome to CISC 121!'
    """
    # str.format() fills the {} placeholder with the value of 'name'
    return "Hello {}! Welcome to CISC 121!".format(name)


# Classifiers that have already been loaded, keyed by model name.
# Building a pipeline is the slowest step, so we do it once per model
# and reuse the result for every later request.
_CLASSIFIER_CACHE = {}


def _get_classifier():
    """
    Returns the image-classification pipeline for MODEL_NAME.

    The first call builds the pipeline (this is when the model is loaded).
    Later calls return the same object from _CLASSIFIER_CACHE, so clicking
    "Analyze" again does not reload the model.

    Returns:
    --------
    callable
        The object returned by transformers.pipeline().
    """
    classifier = _CLASSIFIER_CACHE.get(MODEL_NAME)
    if classifier is None:
        print(f"🔄 Loading model: {MODEL_NAME}")
        print(f"🔑 HF Token: {'Set' if HF_TOKEN else 'Not set (may limit some models)'}")

        classifier = pipeline(
            task="image-classification",  # What kind of task?
            model=MODEL_NAME,             # Which AI model to use?
            token=HF_TOKEN                # Authentication token (optional)
        )
        _CLASSIFIER_CACHE[MODEL_NAME] = classifier
    return classifier


def _to_rgb_image(image):
    """
    Converts whatever Gradio sent us into an RGB PIL image.

    Parameters:
    -----------
    image : str, PIL.Image or numpy.ndarray
        A file path, an already-loaded PIL image, or a pixel array.

    Returns:
    --------
    PIL.Image (or the input unchanged if its type is not recognized)
        The image in RGB mode, whichever way it arrived.
    """
    from PIL import Image

    if isinstance(image, str):
        # It's a file path - open it. The "with" block closes the file
        # once convert() has copied the pixels into memory.
        print("   (Converting from file path)")
        with Image.open(image) as opened:
            return opened.convert('RGB')

    if hasattr(image, 'convert'):
        # It's already a PIL Image
        print("   (Image is PIL format)")
    else:
        # It might be a numpy array - convert to PIL
        print("   (Converting from numpy array)")
        import numpy as np
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)

    # Grayscale or RGBA inputs (for example PNGs with transparency) are
    # converted so every input reaches the model in the same format.
    if getattr(image, 'mode', 'RGB') != 'RGB':
        image = image.convert('RGB')
    return image


def analyze_image(image):
    """
    Runs the AI model on an image and gets back predictions.

    How does this work?
        1. We get the classifier (loaded once, then reused)
        2. We convert the image to RGB
        3. The model runs on this computer and returns a list of
           predictions with confidence scores

    Parameters:
    -----------
    image : PIL.Image, numpy.ndarray, str or None
        The image to analyze. A str is treated as a file path.

    Returns:
    --------
    tuple
        A tuple containing:
        - results (list or None): The AI's predictions, or None if no
          image was given or something went wrong
        - elapsed_time (float): How long the analysis took in seconds
          (0.0 on failure). The first call also includes model loading.

    What is a tuple?
        A tuple is like a container that holds multiple values.
        We use it when a function needs to return more than one thing.
    """
    # Safety check: make sure we actually received an image
    # "None" means "nothing" - the user might not have taken a photo yet
    if image is None:
        print("⚠️ No image provided")
        return None, 0.0

    # Debug: Print what type of image we received
    print(f"📷 Received image type: {type(image)}")
    print(f"📷 Image info: {image if not hasattr(image, 'size') else f'Size: {image.size}'}")

    # Start the stopwatch
    start_time = perf_counter()

    try:
        classifier = _get_classifier()

        print("📷 Analyzing image...")
        image = _to_rgb_image(image)

        # Run the model on the image and get predictions
        results = classifier(image)

        print(f"✅ Analysis complete! Found {len(results)} predictions.")

    except Exception as error:
        # This is the boundary between the model and the web page:
        # we report the problem and return "no result" instead of crashing.
        print(f"❌ Error during image analysis: {error}")
        print(f"   Error type: {type(error).__name__}")

        # Print full traceback for debugging
        import traceback
        traceback.print_exc()

        # Common error explanations
        message = str(error).lower()
        if "401" in message or "unauthorized" in message:
            print("   💡 This might be an authentication issue. Try setting HF_TOKEN.")
        elif "connection" in message or "network" in message:
            print("   💡 Check your internet connection.")
        elif "memory" in message:
            print("   💡 The model might be too large. Try a smaller model.")

        return None, 0.0

    # Stop the stopwatch and work out how long it took
    elapsed_time = perf_counter() - start_time

    return results, elapsed_time


def format_results(results, elapsed_time):
    """
    Formats the AI predictions into readable Markdown text.

    Why format results?
        The raw data from the AI is hard to read.
        We transform it into a nice, human-friendly format.

    Parameters:
    -----------
    results : list or None
        The predictions from the AI model.
        Each prediction is a dict with a 'label' and a 'score' (0 to 1).
        None or an empty list means the analysis produced nothing.

    elapsed_time : float
        How long the analysis took, in seconds.

    Returns:
    --------
    str
        Markdown showing the time taken and up to 5 predictions,
        or an error message if there are no predictions.
    """
    # Handle the case where analysis failed or found nothing
    if not results:
        return "❌ Could not analyze the image. Please try again."

    # Medals for the top 3; everything after that gets a blank spacer
    medals = ("🥇", "🥈", "🥉")

    # :.2f formats a number with 2 decimal places (1.23456 becomes "1.23")
    blocks = [
        "## 🔍 Analysis Results",
        f"⏱️ *Analysis completed in {elapsed_time:.2f} seconds*",
        "### Top Predictions:",
    ]

    # enumerate() gives us both the position (rank) and the item (prediction)
    for rank, prediction in enumerate(results[:5]):
        medal = medals[rank] if rank < len(medals) else "  "
        percentage = prediction['score'] * 100  # Convert 0-1 to a percentage
        blocks.append(f"{medal} **{prediction['label']}**: {percentage:.1f}%")

    # Markdown ignores a single newline, so the pieces are separated by a
    # blank line ("\n\n"). That way each prediction shows on its own line.
    return "\n\n".join(blocks) + "\n"


# ==============================================================================
# SECTION 4: MAIN APPLICATION
# ==============================================================================
# This section builds the actual web interface.
# We use Gradio's "Blocks" system to create a custom layout.
#
# What is gr.Blocks()?
#   It's like a container for our app.
#   Everything inside the "with" block becomes part of the interface.
#
# What does "with" do?
#   "with" creates a context - it's like saying "everything in here belongs together"
#   When we exit the "with" block, Gradio knows our app is complete.
# ==============================================================================

def create_app():
    """
    Builds the whole Gradio web page and hands it back, ready to launch.

    The function has three parts:
        1. The event handlers (small functions that react to user actions)
        2. The layout (what appears on the page, and where)
        3. The wiring (which handler runs for which event)

    Returns:
    --------
    gr.Blocks
        The assembled Gradio application.
    """

    # Status text shown whenever no image is currently selected
    idle_status = "👆 *Choose a tab above and provide an image*"

    # --------------------------------------------------------------------------
    # PART 1: EVENT HANDLERS
    # --------------------------------------------------------------------------
    # Each handler returns the values for the "outputs" list it is wired to.

    def on_upload(image):
        """Runs when the upload box changes; returns (image, status text)."""
        if image is None:
            return None, idle_status
        return image, "✅ **Image uploaded!** Click 'Analyze Image' to continue."

    def on_webcam_capture(image):
        """Runs when the webcam box changes; returns (image, status text)."""
        if image is None:
            return None, idle_status
        return image, "✅ **Photo captured!** Click 'Analyze Image' to continue."

    def on_submit(stored_image):
        """
        Runs when the user clicks the "Analyze Image" button.

        Parameters:
        -----------
        stored_image : PIL.Image or None
            The most recent image saved by on_upload or on_webcam_capture.

        Returns:
        --------
        str
            Markdown text for the results panel.
        """
        # Nothing stored yet - explain how to provide an image
        if stored_image is None:
            return (
                "⚠️ **No image detected!**\n\n"
                "**To fix this:**\n\n"
                "📁 **Upload Tab:** Click the upload area and select an image file\n\n"
                "📷 **Webcam Tab:** Click the camera button (📷) to capture a photo\n\n"
                "Then click 'Analyze Image' again."
            )

        # analyze_image() returns (results, elapsed_time); the * unpacks
        # that pair into the two arguments format_results() expects
        return format_results(*analyze_image(stored_image))

    # --------------------------------------------------------------------------
    # PART 2: LAYOUT
    # --------------------------------------------------------------------------
    # First try to create the container with a theme. If this Gradio version
    # rejects the "theme" argument (TypeError), create it without one.
    try:
        app = gr.Blocks(
            title="CISC 121 Gesture App",  # Browser tab title
            theme=gr.themes.Soft()
        )
    except TypeError:
        app = gr.Blocks(title="CISC 121 Gesture App")

    # Every component created inside "with app:" becomes part of the page
    with app:

        # Page header: title, description, then a horizontal rule
        gr.Markdown(APP_TITLE)
        gr.Markdown(APP_DESCRIPTION)
        gr.Markdown("---")

        # gr.Row() places its children side by side;
        # gr.Column() stacks its children vertically
        with gr.Row():

            # Left side: where the image comes from
            with gr.Column(scale=1):
                gr.Markdown("### 📸 Image Input")

                # One tab per input method
                with gr.Tabs():

                    with gr.TabItem("📁 Upload"):
                        upload_box = gr.Image(
                            label="Click to upload or drag an image here",
                            sources=["upload"],
                            type="pil",
                            height=250
                        )

                    with gr.TabItem("📷 Webcam"):
                        # mirror_webcam is intentionally not passed, for
                        # compatibility with older Gradio versions
                        webcam_box = gr.Image(
                            label="Click the 📷 button below the preview to capture",
                            sources=["webcam"],
                            type="pil",
                            height=250
                        )

                # Tells the user whether an image is ready
                status_text = gr.Markdown(idle_status)

                analyze_button = gr.Button(
                    value="🔍 Analyze Image",
                    variant="primary",
                    size="lg"
                )

            # Right side: where the predictions appear
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Results")

                # Starts with a hint; on_submit's return value replaces it
                results_panel = gr.Markdown(
                    value="*Upload or capture an image, then click 'Analyze Image' to see results.*"
                )

        # ----------------------------------------------------------------------
        # PART 3: WIRING
        # ----------------------------------------------------------------------
        # gr.State() holds a value between interactions. Here it remembers
        # the latest image, whichever tab it came from.
        latest_image = gr.State(value=None)

        # Upload box changed -> remember the image, update the status
        upload_box.change(
            fn=on_upload,
            inputs=[upload_box],
            outputs=[latest_image, status_text]
        )

        # Webcam box changed -> remember the image, update the status
        webcam_box.change(
            fn=on_webcam_capture,
            inputs=[webcam_box],
            outputs=[latest_image, status_text]
        )

        # Button clicked -> analyze the remembered image, show the results
        analyze_button.click(
            fn=on_submit,
            inputs=[latest_image],
            outputs=[results_panel]
        )

        # Page footer
        gr.Markdown("---")
        gr.Markdown(
            "*Made for CISC 121 at Queen's University* 🎓"
        )

    return app


# ==============================================================================
# SECTION 5: RUNNING THE APP
# ==============================================================================
# This is where we actually start the application.
#
# What does if __name__ == "__main__" mean?
#   This checks if we're running this file directly (not importing it).
#   If we run: python app.py → this code runs
#   If we import: from app import create_app → this code doesn't run
#
# Why is this useful?
#   It lets us use the same file in two ways:
#   1. As a standalone app (run it directly)
#   2. As a module (import functions into other files)
# ==============================================================================

if __name__ == "__main__":
    # Show a short startup banner in the terminal
    divider = "=" * 60
    banner = (
        divider,
        "🎓 CISC 121 - Gesture Recognition App",
        divider,
        "Starting the application...",
        "Once ready, open the URL shown below in your browser.",
        divider,
    )
    for line in banner:
        print(line)

    # Build the interface
    app = create_app()

    # Start the web server.
    # share=True asks Gradio for a public URL, which is handy for
    # showing the app to classmates or instructors.
    app.launch(share=True)