Spaces:

kalpniks
/

ASL_Translator

Sleeping

App Files Files Community

kalpniks committed on Nov 20, 2025

Commit

69f9775

verified ·

1 Parent(s): 39068b7

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

Dockerfile +2 -2
app.py +113 -175
requirements.txt +3 -5

Dockerfile CHANGED Viewed

@@ -12,5 +12,5 @@ COPY . /app
 EXPOSE 7860
-# CMD for Streamlit application
-CMD ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]

 EXPOSE 7860
+# CMD for Gradio application
+CMD ["python", "app.py"]

app.py CHANGED Viewed

@@ -1,206 +1,144 @@
-import streamlit as st
-import os
 from collections import Counter
 import time
 import traceback
-from transformers import AutoImageProcessor, SiglipForImageClassification
 from PIL import Image
 import torch
-import cv2
-import numpy as np
-import av
-from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, WebRtcMode
-# ClientSettings import removed as it was causing issues, using dict directly for rtc_configuration
-os.environ["HF_HOME"] = "/tmp/huggingface"
-os.makedirs("/tmp/huggingface", exist_ok=True)
-# Load model and processor
 model_name = "prithivMLmods/Alphabet-Sign-Language-Detection"
-@st.cache_resource
-def load_model_and_processor():
-    print(f"INFO: Loading model '{model_name}'...")
-    model = SiglipForImageClassification.from_pretrained(model_name)
-    processor = AutoImageProcessor.from_pretrained(model_name)
-    print("INFO: Model and processor loaded successfully.")
-    return model, processor
-# Call the cached resource loader once and store in global variables
-model, processor = load_model_and_processor()
-# Define the maximum number of consecutive repetitions allowed for predictions (global constant)
-MAX_CONSECUTIVE_REPETITIONS = 3
-# Define labels (global constant)
-labels = {
-    "0": "A", "1": "B", "2": "C", "3": "D", "4": "E", "5": "F", "6": "G", "7": "H", "8": "I", "9": "J",
-    "10": "K", "11": "L", "12": "M", "13": "N", "14": "O", "15": "P", "16": "Q", "17": "R", "18": "S", "19": "T",
-    "20": "U", "21": "V", "22": "W", "23": "X", "24": "Y", "25": "Z"
-}
-# Initialize session state for live predictions if not already present
-# These are the only session state variables that need to be dynamic
-if 'live_realtime_pred' not in st.session_state:
-    st.session_state.live_realtime_pred = ""
-if 'live_unique_letters' not in st.session_state:
-    st.session_state.live_unique_letters = ""
-if 'live_predicted_frames_buffer' not in st.session_state:
-    st.session_state.live_predicted_frames_buffer = []
-class SignLanguageVideoProcessor(VideoProcessorBase):
-    def __init__(self):
-        # Directly use the global variables (which are cached resources or constants)
-        self.model = model
-        self.processor = processor
-        self.labels = labels
-        self.max_consecutive_repetitions = MAX_CONSECUTIVE_REPETITIONS
-        self.last_predicted_label = None
-        self.consecutive_repetitions = 0
-    def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
-        img_pil = frame.to_image().convert("RGB")
-        inputs = self.processor(images=img_pil, return_tensors="pt")
-        with torch.no_grad():
-            outputs = self.model(**inputs)
-            logits = outputs.logits
-        predicted_label_index = torch.argmax(logits, dim=1).item()
-        current_predicted_label = self.labels[str(predicted_label_index)]
-        # Update the buffer of all predicted letters
-        st.session_state.live_predicted_frames_buffer.append(current_predicted_label)
-        # Apply repetition logic for real-time display
-        if current_predicted_label == self.last_predicted_label:
-            self.consecutive_repetitions += 1
-        else:
-            self.consecutive_repetitions = 1
-        if self.consecutive_repetitions > self.max_consecutive_repetitions or self.last_predicted_label is None:
-            st.session_state.live_realtime_pred = current_predicted_label
-            self.last_predicted_label = current_predicted_label
-            # Update unique letters from the buffer
-            unique_preds = list(dict.fromkeys(st.session_state.live_predicted_frames_buffer))
-            st.session_state.live_unique_letters = ", ".join(unique_preds)
-        return frame # Return original frame (or modified frame if drawing text)
-def sign_language_classification_streamlit(video_path):
-    print("sign_language_classification_streamlit function called.")
-    predicted_letters = []
-    last_predicted_label = None
-    consecutive_repetitions = 0
-    # Access model, processor, labels, and MAX_CONSECUTIVE_REPETITIONS from global scope
-    local_model = model
-    local_processor = processor
-    local_labels = labels
-    local_max_consecutive_repetitions = MAX_CONSECUTIVE_REPETITIONS
     try:
-        cap = cv2.VideoCapture(video_path)
-        if not cap.isOpened():
-            return "Error: Could not open video file.", ""
-        while True:
-            ret, frame = cap.read()
-            if not ret:
-                break
-            image = Image.fromarray(frame).convert("RGB")
-            inputs = local_processor(images=image, return_tensors="pt")
-            with torch.no_grad():
-                outputs = local_model(**inputs)
-                logits = outputs.logits
-            predicted_label_index = torch.argmax(logits, dim=1).item()
-            current_predicted_label = local_labels[str(predicted_label_index)]
-            # Apply repetition logic
-            if current_predicted_label == last_predicted_label:
-                consecutive_repetitions += 1
-            else:
-                consecutive_repetitions = 1
-            if consecutive_repetitions > local_max_consecutive_repetitions or last_predicted_label is None:
-                predicted_letters.append(current_predicted_label)
-                last_predicted_label = current_predicted_label
-        cap.release()
-        unique_predicted_letters = list(dict.fromkeys(predicted_letters))
-        final_output_str = ", ".join(unique_predicted_letters)
-        realtime_equivalent_prediction = unique_predicted_letters[-1] if unique_predicted_letters else ""
-        return realtime_equivalent_prediction, final_output_str
-    except Exception as e:
-        print(f"Error caught: {e}")
-        error_msg = f"Error processing video: {e}"
-        full_traceback_flat = traceback.format_exc().replace('\n', ' | ').replace('\r', '')
-        return error_msg, f"{{error_msg}} (Details: {{full_traceback_flat}})"
-st.set_page_config(page_title="ASL Translator", layout="centered")
-st.title("ASL Translator")
-st.markdown("Upload a video or use your webcam to translate ASL into one of the 26 sign language alphabet categories and see predictions. ASL Words Translator coming soon!")
-# --- Section for Uploaded Video ---
-st.subheader("Translate from Uploaded Video")
-uploaded_file = st.file_uploader("Upload a video file", type=["mp4", "avi", "mov", "webm"])
-if uploaded_file is not None:
-    # Save the uploaded file temporarily
-    video_path = os.path.join("/tmp", uploaded_file.name)
-    with open(video_path, "wb") as f:
-        f.write(uploaded_file.getbuffer())
-    st.video(video_path)
-    if st.button("Translate ASL (from file)"):
-        with st.spinner("Translating video... This might take a while depending on video length."):
-            realtime_pred, unique_letters = sign_language_classification_streamlit(video_path)
-            st.success("Translation Complete!")
-            st.subheader("Last Predicted Sign (from file)")
-            st.write(realtime_pred)
-            st.subheader("Unique Predicted Letters (from file)")
-            st.write(unique_letters)
-        os.remove(video_path) # Clean up temporary file
-else:
-    st.info("Please upload a video file to start the translation.")
-# The line st.markdown("--- # ---") was causing a SyntaxError, temporarily removed for testing.
-# --- Section for Live Webcam ---
-st.subheader("Live ASL Translation from Webcam")
-# Placeholders for live updates
-live_realtime_placeholder = st.empty()
-live_unique_letters_placeholder = st.empty()
-webrtc_ctx = webrtc_streamer(
-    key="webrtc_asl",
-    mode=WebRtcMode.SENDRECV,
-    rtc_configuration={
-        "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
-    }, # Directly providing the dict
-    video_processor_factory=SignLanguageVideoProcessor,
-    media_stream_constraints={"video": True, "audio": False},
-    async_processing=True,
 )
-if webrtc_ctx.state.playing:
-    # Update placeholders based on session state. These will update on each rerun triggered by session_state changes.
-    live_realtime_placeholder.markdown(f"**Real-time Prediction:** {st.session_state.live_realtime_pred}")
-    live_unique_letters_placeholder.markdown(f"**Unique Predicted Letters:** {st.session_state.live_unique_letters}")
-else:
-    # Reset session state when webcam is not playing
-    if st.session_state.live_realtime_pred != "" or st.session_state.live_unique_letters != "":
-        st.session_state.live_realtime_pred = ""
-        st.session_state.live_unique_letters = ""
-        st.session_state.live_predicted_frames_buffer = []
-    st.info("Click 'Start' to begin live ASL translation from your webcam.")

+# Import necessary libraries
 from collections import Counter
 import time
 import traceback
+import gradio as gr
+from transformers import AutoImageProcessor
+from transformers import SiglipForImageClassification
+from transformers.image_utils import load_image
 from PIL import Image
 import torch
+import cv2 # Import cv2 for video frame processing
+# Load model and processor for Alphabet Sign Language Detection
 model_name = "prithivMLmods/Alphabet-Sign-Language-Detection"
+model = SiglipForImageClassification.from_pretrained(model_name)
+processor = AutoImageProcessor.from_pretrained(model_name)
+# Define the maximum number of consecutive repetitions allowed for predictions
+MAX_CONSECUTIVE_REPETITIONS = 3
+def sign_language_classification(video):
+    """
+    Predicts sign language alphabet category for each frame in a video,
+    yields predictions in real-time with repetition handling, and returns a list of unique predicted letters.
+    """
+    print("sign_language_classification function called.") # Debug print to indicate function call
+    if video is None:
+        print("No video provided.") # Debug print if no video input
+        yield "No video provided.", "" # Yield empty string for the second output if no video
+        return
+    print(f"Video input type: {type(video)}") # Debug print to show video input type
+    print(f"Video value: {video}") # Debug print to show video input value
+    predicted_letters = [] # List to store all predicted letters from each frame
+    last_predicted_label = None # Initialize variable to store the last predicted label to handle repetitions
+    consecutive_repetitions = 0 # Initialize counter for consecutive repetitions of the same prediction
     try:
+        print("Starting frame processing loop.") # Debug print to indicate start of frame processing
+        frames = []
+        if isinstance(video, str):
+            # If video is a filepath (e.g., uploaded file), load the video frames using OpenCV
+            cap = cv2.VideoCapture(video)
+            if not cap.isOpened():
+                yield "Error: Could not open video file.", "" # Yield error if video file cannot be opened
+                return
+            while True:
+                ret, frame = cap.read()
+                if not ret: # Break the loop if no more frames are returned
+                    break
+                frames.append(frame) # Append the read frame to the frames list
+            cap.release() # Release the video capture object
+        elif isinstance(video, list):
+             # If video is already a list of frames (e.g., from webcam in some Gradio versions)
+             frames = video
+        else:
+             yield "Error: Unsupported video input type.", "" # Yield error for unsupported video input types
+             return
+        for i, frame in enumerate(frames):
+            # print(f"Processing frame {i}") # Debug print - Removed for cleaner output
+            # Convert the numpy frame (BGR format from OpenCV) to a PIL Image in RGB format for the model
+            image = Image.fromarray(frame).convert("RGB")
+            # print(f"Frame {i} converted to PIL Image.") # Debug print - Removed for cleaner output
+            # Process the image frame using the pre-trained processor and model
+            inputs = processor(images=image, return_tensors="pt") # Prepare image for model input
+            # print(f"Frame {i} processed by processor.") # Debug print - Removed for cleaner output
+            # Perform inference with the model
+            with torch.no_grad(): # Disable gradient calculation for inference
+                outputs = model(**inputs)
+                logits = outputs.logits # Get the raw output scores (logits)
+                probs = torch.nn.functional.softmax(logits, dim=1).squeeze().tolist() # Apply softmax to get probabilities and convert to list
+                # print(f"Frame {i} processed by model. Logits shape: {logits.shape}") # Debug print - Removed for cleaner output
+            # Define the labels mapping model output indices to ASL alphabet letters
+            labels = {
+                "0": "A", "1": "B", "2": "C", "3": "D", "4": "E", "5": "F", "6": "G", "7": "H", "8": "I", "9": "J",
+                "10": "K", "11": "L", "12": "M", "13": "N", "14": "O", "15": "P", "16": "Q", "17": "R", "18": "S", "19": "T",
+                "20": "U", "21": "V", "22": "W", "23": "X", "24": "Y", "25": "Z"
+            }
+            # Get the index of the highest probability and find the corresponding predicted label
+            predicted_label_index = probs.index(max(probs))
+            predicted_label = labels[str(predicted_label_index)]
+            # print(f"Frame {i} prediction: {predicted_label}") # Debug print - Removed for cleaner output
+            predicted_letters.append(predicted_label) # Append predicted letter to the list of all predictions
+            # Check for consecutive repetitions and yield only if the rule is met
+            if predicted_label == last_predicted_label:
+                consecutive_repetitions += 1
+            else:
+                consecutive_repetitions = 1 # Reset consecutive count if prediction changes
+            # Yield the prediction if it's not a consecutive repetition beyond the limit or if it's the first prediction
+            if consecutive_repetitions > MAX_CONSECUTIVE_REPETITIONS or last_predicted_label is None:
+                 yield predicted_label, "" # Yield real-time prediction and empty string for the second output
+                 last_predicted_label = predicted_label # Update the last predicted label
+        print("Finished frame processing loop.") # Debug print to indicate end of frame processing
+        # Get unique predicted letters while maintaining order of appearance
+        unique_predicted_letters = list(dict.fromkeys(predicted_letters))
+        final_output = ", ".join(unique_predicted_letters) # Join unique letters into a comma-separated string
+        # Yield the last predicted label (or empty string if none) and the final list of unique letters
+        yield last_predicted_label if last_predicted_label is not None else "", final_output
+    except Exception as e:
+        print(f"Error caught: {e}") # Debug print if an error occurs
+        # Yield error message and traceback information in case of an exception
+        yield f"Error processing video: {e}", f"Error processing video: {e}
+{traceback.format_exc()}"
+# Custom CSS for styling (commented out)
+# custom_css = """
+# body {
+#   background-color: #add8e6;
+# }
+# """
+# Create Gradio interface with video input and multiple outputs
+# The two output components receive, in order, the two items of each tuple
+# yielded by sign_language_classification
+iface = gr.Interface(
+    fn=sign_language_classification, # The function to run when the user interacts with the interface
+    inputs=gr.Video(sources=["upload", "webcam"]), # Input component: Video, allowing upload or webcam
+    outputs=[
+        gr.Label(label="Real-time Prediction"), # Output component: Label to display the real-time prediction
+        gr.Textbox(label="Unique Predicted Letters") # Output component: Textbox to display the final list of unique predicted letters
+    ],
+    title="ASL Translator", # Title of the Gradio interface
+    description="Upload a video or use your webcam to translate ASL into one of the 26 sign language alphabet categories and see predictions in real-time and a summary list. ASL Words Translator coming soon!", # Description displayed below the title
+    # css=custom_css # Apply custom CSS (commented out)
 )
+# Launch the Gradio app
+# The Dockerfile in this commit starts the app with `python app.py`, so this
+# guard is the entry point inside the container; port 7860 matches the
+# Dockerfile's EXPOSE directive
+if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=7860)

requirements.txt CHANGED Viewed

@@ -1,10 +1,8 @@
-streamlit
 opencv-python-headless
-transformers==4.40.1
 torch
 Pillow
-streamlit_webrtc
-av==12.0.0
 numpy
-# huggingface_hub==0.20.0 # Removed pinning to let transformers choose

+gradio
 opencv-python-headless
+transformers
 torch
 Pillow
+av
 numpy