File size: 21,021 Bytes
5378b38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdfd4e1
 
 
 
 
 
 
 
 
 
 
 
 
5378b38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ae9d2d
 
5378b38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
"""
╔══════════════════════════════════════════════════════════════════════════════╗
║                    CISC 121 - HAND GESTURE RECOGNITION APP                  ║
║                         Queen's University                                   ║
║                                                                              ║
║  PURPOSE: This app uses AI to recognize hand gestures (one, peace, etc.)   ║
║  VERSION: Procedural (step-by-step) - Great for beginners!                  ║
║                                                                              ║
║  HOW TO RUN: python app.py                                                   ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""

# ==============================================================================
# SECTION 1: IMPORTS
# ==============================================================================
# What are imports?
#   Imports let us use code that other people wrote.
#   Instead of writing everything from scratch, we can use "libraries".
#
# Think of it like borrowing tools:
#   - gradio = tools for building web pages
#   - transformers = tools for AI/machine learning
#   - time = tools for measuring how long things take
#   - os = tools for working with the operating system (like reading files)
# ==============================================================================

import gradio as gr
# "gr" is a short nickname for "gradio" - it saves us typing!
# Example: instead of gradio.Button(), we can write gr.Button()

from transformers import pipeline
# "pipeline" is a function that makes using AI models easy.
# It handles all the complicated setup for us.

from time import perf_counter
# "perf_counter" is like a stopwatch - it measures time very precisely.

import os
# "os" lets us interact with the operating system
# We use it to read environment variables (like secret tokens)


# ==============================================================================
# SECTION 2: CONFIGURATION (SETTINGS)
# ==============================================================================
# What is configuration?
#   These are settings we can change to customize how the app works.
#   By putting them at the top, they're easy to find and modify.
# ==============================================================================

# The AI model we will use for hand gesture recognition
# 
# MODEL OPTIONS:
#   1. "dima806/hand_gestures_image_detection" (RECOMMENDED)
#      - Recognizes: one, two, three, four, fist, ok, like, peace, etc.
#      - Trained specifically for hand gestures!
#
#   2. "google/vit-base-patch16-224" (General purpose)
#      - Recognizes 1000 everyday objects (cats, cars, etc.)
#      - NOT trained for hand gestures - won't work for finger counting
#
#   3. "microsoft/resnet-50" (General purpose, faster)
#      - Similar to Google's model, but faster
#
MODEL_NAME = "dima806/hand_gestures_image_detection"

# Hugging Face Token (Optional but recommended)
# Some models require authentication to download.
# Get your free token at: https://huggingface.co/settings/tokens
#
# Option 1: Set as environment variable (recommended for security)
#   export HF_TOKEN="your_token_here"
#
# Option 2: Paste directly here (less secure, but okay for learning)
#   HF_TOKEN = "hf_xxxxxxxxxxxxxxxxxxxxx"
#
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# os.environ.get() tries to read the HF_TOKEN from environment variables
# If not found, it returns None (which means "no token")

# App title and description
# Both strings are Markdown and are shown at the top of the web page.
# The emoji are written as real Unicode characters (the file must be saved
# as UTF-8) so they display correctly in the browser.
APP_TITLE = "## 🎓 CISC 121 - Hand Gesture Recognition App"
APP_DESCRIPTION = """
Welcome! This app uses AI to recognize **hand gestures**.

**Supported Gestures:**
✋ one, ✌️ two/peace, 🤟 three, 🖖 four, ✊ fist, 👍 like, 👎 dislike, 👌 ok, 🤚 stop

**How to use:**
1. **Upload an image** OR **use your webcam**
2. Show a hand gesture clearly in frame
3. Click **"🔍 Analyze Image"** to see the AI's prediction

> 💡 **Tip:** Make sure your hand is well-lit and clearly visible!
"""


# ==============================================================================
# SECTION 3: HELPER FUNCTIONS
# ==============================================================================
# What are functions?
#   Functions are reusable blocks of code that do one specific job.
#   We give them a name, and then we can "call" them whenever we need them.
#
# Why use functions?
#   1. Reusability - write once, use many times
#   2. Organization - break big problems into small pieces
#   3. Readability - give meaningful names to actions
# ==============================================================================

def create_greeting(name):
    """
    Build the welcome line for one CISC 121 student.

    Parameters:
    -----------
    name : str
        Who we are greeting. The value is inserted into the message
        exactly as given (no trimming or capitalizing).

    Returns:
    --------
    str
        The finished greeting sentence.

    Example:
    --------
    >>> create_greeting("Alice")
    'Hello Alice! Welcome to CISC 121!'
    """
    # str.format() fills the {} placeholder with the value of 'name'.
    # It produces the same text an f-string would.
    return "Hello {}! Welcome to CISC 121!".format(name)


# Classifiers we have already built, keyed by (model name, token).
# Building a pipeline loads the whole model, which is slow, so we do it
# once and reuse it instead of reloading on every button click.
_CLASSIFIER_CACHE = {}


def _get_classifier():
    """Return the image classifier for MODEL_NAME, building it on first use."""
    cache_key = (MODEL_NAME, HF_TOKEN)
    classifier = _CLASSIFIER_CACHE.get(cache_key)
    if classifier is None:
        print(f"🔄 Loading model: {MODEL_NAME}")
        print(f"🔑 HF Token: {'Set' if HF_TOKEN else 'Not set (may limit some models)'}")
        classifier = pipeline(
            task="image-classification",  # What kind of task?
            model=MODEL_NAME,             # Which AI model to use?
            token=HF_TOKEN,               # Authentication token (optional)
        )
        _CLASSIFIER_CACHE[cache_key] = classifier
    return classifier


def _to_rgb_image(image):
    """Turn a file path, PIL image, or numpy array into an RGB PIL image."""
    # Imported here so the rest of the file still loads without Pillow/numpy.
    from PIL import Image

    if isinstance(image, str):
        print("   (Converting from file path)")
        # "with" closes the file once the pixels have been copied by convert()
        with Image.open(image) as opened:
            return opened.convert('RGB')

    if hasattr(image, 'convert'):
        print("   (Image is PIL format)")
        return image if image.mode == 'RGB' else image.convert('RGB')

    import numpy as np
    if isinstance(image, np.ndarray):
        print("   (Converting from numpy array)")
        return Image.fromarray(image).convert('RGB')

    # Anything else is a mistake - say so clearly instead of guessing
    raise TypeError(f"Unsupported image type: {type(image).__name__}")


def analyze_image(image):
    """
    Runs the AI model on an image and gets back predictions.

    How does this work?
        1. The model is loaded with the transformers "pipeline" function
           (it is downloaded from Hugging Face the first time, then reused)
        2. The image is converted to an RGB PIL image
        3. The model analyzes the image
        4. We get back a list of predictions with confidence scores

    Parameters:
    -----------
    image : PIL.Image, numpy.ndarray, str, or None
        The image to analyze. A str is treated as a path to an image file.
        None means "no image was provided".

    Returns:
    --------
    tuple
        A tuple containing:
        - results (list or None): The AI's predictions, or None when there
          was no image or the analysis failed
        - elapsed_time (float): How long the analysis took in seconds
          (0.0 when results is None). The first call also includes the
          time needed to load the model.

    What is a tuple?
        A tuple is like a container that holds multiple values.
        We use it when a function needs to return more than one thing.
    """
    # Safety check: make sure we actually received an image
    # "None" means "nothing" - the user might not have taken a photo yet
    if image is None:
        print("⚠️ No image provided")
        return None, 0.0

    # Debug: Print what type of image we received
    print(f"📷 Received image type: {type(image)}")
    print(f"📷 Image info: {image if not hasattr(image, 'size') else f'Size: {image.size}'}")

    # Start the stopwatch
    start_time = perf_counter()

    try:
        # Get the (cached) classifier, then prepare and classify the image
        classifier = _get_classifier()

        print("📷 Analyzing image...")
        results = classifier(_to_rgb_image(image))

        print(f"✅ Analysis complete! Found {len(results)} predictions.")

    except Exception as error:
        # If something goes wrong, we catch the error
        # This prevents the app from crashing; the caller sees (None, 0.0)
        print(f"❌ Error during image analysis: {error}")
        print(f"   Error type: {type(error).__name__}")

        # Print full traceback for debugging
        import traceback
        traceback.print_exc()

        # Common error explanations
        message = str(error)
        lowered = message.lower()
        if "401" in message or "unauthorized" in lowered:
            print("   💡 This might be an authentication issue. Try setting HF_TOKEN.")
        elif "connection" in lowered or "network" in lowered:
            print("   💡 Check your internet connection.")
        elif "memory" in lowered:
            print("   💡 The model might be too large. Try a smaller model.")

        return None, 0.0

    # Stop the stopwatch and calculate how long it took
    elapsed_time = perf_counter() - start_time

    return results, elapsed_time


def format_results(results, elapsed_time):
    """
    Formats the AI predictions into a readable Markdown string.

    Why format results?
        The raw data from the AI is hard to read.
        We transform it into a nice, human-friendly format.

    Parameters:
    -----------
    results : list or None
        The predictions from the AI model.
        Each prediction is a dictionary with a 'label' and a 'score'
        (confidence from 0 to 1). None means the analysis failed.

    elapsed_time : float
        How long the analysis took, in seconds.

    Returns:
    --------
    str
        Markdown text showing up to 5 predictions (or an error message
        when results is None).
    """
    # Handle the case where analysis failed
    if results is None:
        return "❌ Could not analyze the image. Please try again."

    # Medals for the first three places; later places get blank padding
    medals = ("🥇", "🥈", "🥉")

    # Each entry becomes its own Markdown paragraph.
    # What does :.2f mean?
    #   It formats a number to show 2 decimal places.
    #   Example: 1.23456 becomes "1.23"
    blocks = [
        "## 🔍 Analysis Results",
        f"⏱️ *Analysis completed in {elapsed_time:.2f} seconds*",
        "### Top Predictions:",
    ]

    # Only show the top 5 predictions
    top_predictions = results[:5]
    if not top_predictions:
        # The model answered, but with an empty list - tell the user
        blocks.append("*No predictions were returned.*")

    # enumerate() gives us both the position (rank) and the item (prediction)
    for rank, prediction in enumerate(top_predictions):
        label = prediction['label']            # What the AI thinks it sees
        percentage = prediction['score'] * 100  # Confidence as a percentage
        medal = medals[rank] if rank < len(medals) else "  "
        blocks.append(f"{medal} **{label}**: {percentage:.1f}%")

    # Markdown ignores a single new line, so we separate the pieces with a
    # blank line ("\n\n"). That way every prediction shows on its own line.
    return "\n\n".join(blocks) + "\n"


# ==============================================================================
# SECTION 4: MAIN APPLICATION
# ==============================================================================
# This section builds the actual web interface.
# We use Gradio's "Blocks" system to create a custom layout.
#
# What is gr.Blocks()?
#   It's like a container for our app.
#   Everything inside the "with" block becomes part of the interface.
#
# What does "with" do?
#   "with" creates a context - it's like saying "everything in here belongs together"
#   When we exit the "with" block, Gradio knows our app is complete.
# ==============================================================================

def create_app():
    """
    Creates and returns the Gradio application.

    The page has a header, an image input area (upload tab + webcam tab),
    an "Analyze Image" button, a results area, and a footer. The most
    recently provided image is kept in a gr.State so the button can analyze
    it no matter which tab it came from.

    Why put this in a function?
        1. It keeps the code organized
        2. We can easily test or modify the app
        3. It's a good habit for larger programs

    Returns:
    --------
    gr.Blocks
        The complete Gradio application, ready to launch.
    """

    # Status text shown while no image has been provided yet.
    # Stored once so the three places that use it always agree.
    idle_status = "👆 *Choose a tab above and provide an image*"

    # Create the app container
    # Note: We use try/except for theme to support different Gradio versions
    # Older versions don't support the theme parameter the same way
    try:
        # Try modern Gradio syntax (4.x+)
        app = gr.Blocks(
            title="CISC 121 Gesture App",  # Browser tab title
            theme=gr.themes.Soft()          # A nice, modern look
        )
    except TypeError:
        # Fall back for older Gradio versions
        app = gr.Blocks(title="CISC 121 Gesture App")

    with app:

        # ----------------------------------------------------------------------
        # PART A: HEADER SECTION
        # ----------------------------------------------------------------------
        # gr.Markdown() lets us add formatted text using Markdown syntax
        # Markdown is a simple way to format text (like in README files)

        gr.Markdown(APP_TITLE)
        gr.Markdown(APP_DESCRIPTION)

        # Add a horizontal line for visual separation
        gr.Markdown("---")

        # ----------------------------------------------------------------------
        # PART B: IMAGE INPUT AND RESULTS SECTION
        # ----------------------------------------------------------------------
        # gr.Row() puts components side by side (horizontal layout)
        # gr.Column() stacks components on top of each other (vertical layout)

        with gr.Row():

            # Left column: Image input
            with gr.Column(scale=1):
                gr.Markdown("### 📸 Image Input")

                # Create tabs for different input methods
                # This makes it clearer for users how to provide an image
                with gr.Tabs():

                    # Tab 1: Upload an image file
                    with gr.TabItem("📁 Upload"):
                        upload_input = gr.Image(
                            label="Click to upload or drag an image here",
                            sources=["upload"],
                            type="pil",
                            height=250
                        )

                    # Tab 2: Use webcam (captures on click)
                    with gr.TabItem("📷 Webcam"):
                        webcam_input = gr.Image(
                            label="Click the 📷 button below the preview to capture",
                            sources=["webcam"],
                            type="pil",
                            height=250
                            # Note: mirror_webcam removed for compatibility with older Gradio
                        )

                # Status indicator - shows when image is ready
                status_display = gr.Markdown(idle_status)

                # The submit button
                submit_button = gr.Button(
                    value="🔍 Analyze Image",
                    variant="primary",
                    size="lg"
                )

            # Right column: Results
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Results")

                # gr.Markdown() can also display dynamic content
                # We'll update this when the user clicks the button
                results_display = gr.Markdown(
                    value="*Upload or capture an image, then click 'Analyze Image' to see results.*"
                )

        # ----------------------------------------------------------------------
        # PART C: CONNECTING COMPONENTS (EVENT HANDLING)
        # ----------------------------------------------------------------------
        # Now we connect the inputs to our functions.
        # We have TWO input sources (upload and webcam) that both need to work.

        # State variable to store the current image (from either source)
        # gr.State() is a special Gradio component that stores data between interactions
        current_image = gr.State(value=None)

        def store_image(image, ready_message):
            """
            Shared logic for both input tabs.

            Returns the pair (image to remember, status text to show).
            When the image was cleared (None) we forget the stored image
            and show the idle status again.
            """
            if image is not None:
                return image, ready_message
            return None, idle_status

        def on_upload(image):
            """Called when the image in the Upload tab changes."""
            return store_image(
                image,
                "✅ **Image uploaded!** Click 'Analyze Image' to continue."
            )

        def on_webcam_capture(image):
            """Called when the image in the Webcam tab changes."""
            return store_image(
                image,
                "✅ **Photo captured!** Click 'Analyze Image' to continue."
            )

        def on_submit(stored_image):
            """
            This function runs when the user clicks the submit button.

            It's called an "event handler" because it handles the click event.

            Parameters:
            -----------
            stored_image : PIL.Image or None
                The image stored from upload or webcam capture.
                None means no image has been provided yet.

            Returns:
            --------
            str
                Formatted results (or a help message) to display.
            """
            # Check if we have an image
            if stored_image is None:
                # Python joins neighbouring string pieces into one string,
                # which lets us keep each line of this long message short.
                return (
                    "⚠️ **No image detected!**\n\n"
                    "**To fix this:**\n\n"
                    "📁 **Upload Tab:** Click the upload area and select an image file\n\n"
                    "📷 **Webcam Tab:** Click the camera button (📷) to capture a photo\n\n"
                    "Then click 'Analyze Image' again."
                )

            # Step 1: Analyze the image
            results, elapsed_time = analyze_image(stored_image)

            # Step 2: Format the results nicely
            # Step 3: Return the formatted text (Gradio displays it)
            return format_results(results, elapsed_time)

        # Connect upload input - when image changes, store it
        upload_input.change(
            fn=on_upload,
            inputs=[upload_input],
            outputs=[current_image, status_display]
        )

        # Connect webcam input - when image is captured, store it
        webcam_input.change(
            fn=on_webcam_capture,
            inputs=[webcam_input],
            outputs=[current_image, status_display]
        )

        # Connect the button click to analyze the stored image
        submit_button.click(
            fn=on_submit,
            inputs=[current_image],
            outputs=[results_display]
        )

        # ----------------------------------------------------------------------
        # PART D: FOOTER
        # ----------------------------------------------------------------------
        gr.Markdown("---")
        gr.Markdown(
            "*Made for CISC 121 at Queen's University* 🎓"
        )

    # Return the completed app
    return app


# ==============================================================================
# SECTION 5: RUNNING THE APP
# ==============================================================================
# This is where we actually start the application.
#
# What does if __name__ == "__main__" mean?
#   This checks if we're running this file directly (not importing it).
#   If we run: python hf_gradio_proj.py → this code runs
#   If we import: from hf_gradio_proj import create_app → this code doesn't run
#
# Why is this useful?
#   It lets us use the same file in two ways:
#   1. As a standalone app (run it directly)
#   2. As a module (import functions into other files)
# ==============================================================================

if __name__ == "__main__":
    # Print a welcome message to the terminal
    divider = "=" * 60
    print(divider)
    print("🎓 CISC 121 - Gesture Recognition App")
    print(divider)
    print("Starting the application...")
    print("Once ready, open the URL shown below in your browser.")
    print(divider)

    # Create the app
    app = create_app()

    # Launch the app
    # share=True creates a public URL anyone can access
    # This is useful for sharing with classmates or instructors
    # NOTE: anyone who has the link can use the app while it is running,
    # so change this to share=False if you only want to use it yourself.
    app.launch(share=True)