Upload folder using huggingface_hub
app.py CHANGED
@@ -1,178 +1,19 @@
import os
os.environ["HF_HOME"] = "/tmp/huggingface"
os.makedirs("/tmp/huggingface", exist_ok=True)

-print("INFO: app starting.")
-
-# Import necessary libraries
-print("INFO: Importing collections...")
-from collections import Counter
-print("INFO: Importing time...")
-import time
-print("INFO: Importing traceback...")
-import traceback
-print("INFO: Importing gradio...")
-import gradio as gr
-print("INFO: Importing transformers.AutoImageProcessor...")
-from transformers import AutoImageProcessor
-print("INFO: Importing transformers.SiglipForImageClassification...")
-from transformers import SiglipForImageClassification
-print("INFO: Importing transformers.image_utils.load_image...")
-from transformers.image_utils import load_image
-print("INFO: Importing PIL.Image...")
-from PIL import Image
-print("INFO: Importing torch...")
-import torch
-print("INFO: Importing cv2...")
-import cv2  # Import cv2 for video frame processing
-
-print("INFO: All libraries imported successfully.")
-
-print("INFO: Loading model and processor.")
-# Load the model and processor for Alphabet Sign Language Detection
-model_name = "prithivMLmods/Alphabet-Sign-Language-Detection"
-print(f"INFO: Loading model '{model_name}'...")
-model = SiglipForImageClassification.from_pretrained(model_name)
-processor = AutoImageProcessor.from_pretrained(model_name)
-print("INFO: Model and processor loaded successfully.")
-
-# Maximum number of consecutive repetitions allowed for predictions
-MAX_CONSECUTIVE_REPETITIONS = 3
-
-
-def sign_language_classification(video):
-    """
-    Predicts the sign language alphabet category for each frame of a video,
-    yields predictions in real time with repetition handling, and returns a list of unique predicted letters.
-    """
-    print("sign_language_classification function called.")  # Debug print to indicate the function was called
-    if video is None:
-        print("No video provided.")  # Debug print if there is no video input
-        print(f"DEBUG: Yielding 'No video provided.' and '', types: {type('No video provided.')}, {type('')}")
-        yield "No video provided.", ""  # Yield an empty string for the second output if there is no video
-        return
-
-    print(f"Video input type: {type(video)}")  # Debug print to show the video input type
-    print(f"Video value: {video}")  # Debug print to show the video input value
-
-    predicted_letters = []  # List to store all predicted letters from each frame
-    last_predicted_label = None  # Last predicted label, used to handle repetitions
-    consecutive_repetitions = 0  # Counter for consecutive repetitions of the same prediction
-
-    try:
-        print("Starting frame processing loop.")  # Debug print to indicate the start of frame processing
-        frames = []
-        if isinstance(video, str):
-            # If video is a file path (e.g., an uploaded file), load the video frames using OpenCV
-            cap = cv2.VideoCapture(video)
-            if not cap.isOpened():
-                print(f"DEBUG: Yielding 'Error: Could not open video file.' and '', types: {type('Error: Could not open video file.')}, {type('')}")
-                yield "Error: Could not open video file.", ""  # Yield an error if the video file cannot be opened
-                return
-            while True:
-                ret, frame = cap.read()
-                if not ret:  # Break the loop if no more frames are returned
-                    break
-                frames.append(frame)  # Append the read frame to the frames list
-            cap.release()  # Release the video capture object
-        elif isinstance(video, list):
-            # If video is already a list of frames (e.g., from the webcam in some Gradio versions)
-            frames = video
-        else:
-            print(f"DEBUG: Yielding 'Error: Unsupported video input type.' and '', types: {type('Error: Unsupported video input type.')}, {type('')}")
-            yield "Error: Unsupported video input type.", ""  # Yield an error for unsupported video input types
-            return
-
-
-        for i, frame in enumerate(frames):
-            # print(f"Processing frame {i}")  # Debug print - removed for cleaner output
-
-            # Convert the numpy frame (BGR format from OpenCV) to a PIL Image in RGB format for the model
-            image = Image.fromarray(frame).convert("RGB")
-            # print(f"Frame {i} converted to PIL Image.")  # Debug print - removed for cleaner output
-
-
-            # Process the image frame using the pre-trained processor and model
-            inputs = processor(images=image, return_tensors="pt")  # Prepare the image for model input
-            # print(f"Frame {i} processed by processor.")  # Debug print - removed for cleaner output
-
-
-            # Perform inference with the model
-            with torch.no_grad():  # Disable gradient calculation for inference
-                outputs = model(**inputs)
-                logits = outputs.logits  # Get the raw output scores (logits)
-                # probs = torch.nn.functional.softmax(logits, dim=1).squeeze().tolist()  # Apply softmax to get probabilities as a list
-                # print(f"Frame {i} processed by model. Logits shape: {logits.shape}")  # Debug print - removed for cleaner output
-
-
-            # Labels mapping model output indices to ASL alphabet letters
-            labels = {
-                "0": "A", "1": "B", "2": "C", "3": "D", "4": "E", "5": "F", "6": "G", "7": "H", "8": "I", "9": "J",
-                "10": "K", "11": "L", "12": "M", "13": "N", "14": "O", "15": "P", "16": "Q", "17": "R", "18": "S", "19": "T",
-                "20": "U", "21": "V", "22": "W", "23": "X", "24": "Y", "25": "Z"
-            }
-            # Get the index of the highest score directly from the logits
-            predicted_label_index = torch.argmax(logits, dim=1).item()
-            predicted_label = labels[str(predicted_label_index)]
-            # print(f"Frame {i} prediction: {predicted_label}")  # Debug print - removed for cleaner output
-
-            predicted_letters.append(predicted_label)  # Append the predicted letter to the list of all predictions
-
-            # Check for consecutive repetitions and yield only if the rule is met
-            if predicted_label == last_predicted_label:
-                consecutive_repetitions += 1
-            else:
-                consecutive_repetitions = 1  # Reset the consecutive count if the prediction changes
-
-            # Yield the prediction if it is not a consecutive repetition beyond the limit, or if it is the first prediction
-            if consecutive_repetitions > MAX_CONSECUTIVE_REPETITIONS or last_predicted_label is None:
-                print(f"DEBUG: Yielding predicted_label: {predicted_label}, type: {type(predicted_label)}")
-                yield predicted_label, ""  # Yield the real-time prediction and an empty string for the second output
-            last_predicted_label = predicted_label  # Update the last predicted label
-
-
-        print("Finished frame processing loop.")  # Debug print to indicate the end of frame processing
-        # Get unique predicted letters while maintaining order of appearance
-        unique_predicted_letters = list(dict.fromkeys(predicted_letters))
-        final_output = ", ".join(unique_predicted_letters)  # Join unique letters into a comma-separated string
-        # Yield the last predicted label (or an empty string if none) and the final list of unique letters
-        output1 = last_predicted_label if last_predicted_label is not None else ""
-        output2 = final_output
-        print(f"DEBUG: Final yield outputs - output1: {output1}, type: {type(output1)}; output2: {output2}, type: {type(output2)}")
-        yield output1, output2
-
-    except Exception as e:
-        print(f"Error caught: {e}")  # Debug print if an error occurs
-        # Yield the error message and traceback information in case of an exception
-        error_message = f"Error processing video: {e}"
-        traceback_info = f"Error processing video: {e}\n{traceback.format_exc()}"
-        print(f"DEBUG: Error yield outputs - error_message: {error_message}, type: {type(error_message)}; traceback_info: {traceback_info}, type: {type(traceback_info)}")
-        yield error_message, traceback_info

-
-
-
-# background-color: #add8e6;
-# }
-# """

-
-# Create the Gradio interface with video input and multiple outputs
-iface = gr.Interface(
-    fn=sign_language_classification,  # The function to run when the user interacts with the interface
-    inputs=gr.Video(),  # Input component: video, allowing upload or webcam capture
-    outputs=[
-        gr.Label(label="Real-time Prediction"),  # Output component: label to display the real-time prediction
-        gr.Textbox(label="Unique Predicted Letters")  # Output component: textbox for the final list of unique predicted letters
-    ],
-    title="ASL Translator",  # Title of the Gradio interface
-    description="Upload a video or use your webcam to translate ASL into one of the 26 sign language alphabet categories, with real-time predictions and a summary list. ASL Words Translator coming soon!",  # Description displayed below the title
-    # css=custom_css  # Apply custom CSS (commented out)
-)
-print("INFO: Gradio interface created.")

-# Launch the Gradio app
if __name__ == "__main__":
-    print("INFO: Attempting to launch Gradio app.")
    iface.queue().launch(share=True)
-    print("INFO: Gradio app launch command executed.")
+import gradio as gr
+import time
import os
+
os.environ["HF_HOME"] = "/tmp/huggingface"
os.makedirs("/tmp/huggingface", exist_ok=True)

+print("INFO: Simple Gradio app starting.")

+def greet(name):
+    time.sleep(1)
+    return "Hello " + name + "!"

+iface = gr.Interface(fn=greet, inputs="text", outputs="text", title="Minimal Gradio App")

if __name__ == "__main__":
+    print("INFO: Attempting to launch simple Gradio app.")
    iface.queue().launch(share=True)
+    print("INFO: Simple Gradio app launch command executed.")
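
A note for anyone restoring the removed pipeline: `cv2.VideoCapture` decodes frames in BGR channel order, and `Image.fromarray(frame).convert("RGB")` only tags the array as RGB without reordering the channels, so red and blue were swapped before inference. A minimal sketch of the usual fix (the helper name `frame_to_pil` is illustrative, not from the original file):

import cv2
from PIL import Image

def frame_to_pil(frame):
    # OpenCV frames are BGR; reorder to RGB before building the PIL image
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb)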
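
Relatedly, the removed yield condition `consecutive_repetitions > MAX_CONSECUTIVE_REPETITIONS or last_predicted_label is None` emits a letter only after it has repeated more than three times, while the surrounding comments describe suppressing repeats beyond the limit. If the intent was to report each letter once per run of identical frame predictions, a sketch of that debouncing (a hypothetical helper, not the app's code):

def collapse_runs(predictions):
    # Emit a label only when it differs from the previous frame's prediction,
    # collapsing each run of identical predictions into a single output.
    last = None
    for label in predictions:
        if label != last:
            yield label
        last = label

# Example: list(collapse_runs("AAABBA")) == ["A", "B", "A"]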
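
Finally, the removed code rebuilt the 26-entry labels dict on every frame even though it never changes. It can be hoisted out of the loop and derived from the alphabet in one line; if the checkpoint populates its config (worth verifying for this model), `model.config.id2label` would serve the same purpose:

import string

# Build {"0": "A", ..., "25": "Z"} once, outside the frame loop
labels = {str(i): letter for i, letter in enumerate(string.ascii_uppercase)}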