Upload folder using huggingface_hub

- app.py +102 -4
- requirements.txt +3 -0
app.py CHANGED

@@ -7,6 +7,9 @@ from transformers import AutoImageProcessor, SiglipForImageClassification
 from PIL import Image
 import torch
 import cv2
+import numpy as np # Required for opencv and streamlit-webrtc frame processing
+import av # Required for streamlit-webrtc
+from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, ClientSettings

 os.environ["HF_HOME"] = "/tmp/huggingface"
 os.makedirs("/tmp/huggingface", exist_ok=True)
@@ -33,6 +36,64 @@ labels = {
     "20": "U", "21": "V", "22": "W", "23": "X", "24": "Y", "25": "Z"
 }

+# Store model and processor in session state for access by VideoProcessor
+if 'model_obj' not in st.session_state:
+    st.session_state.model_obj = model
+if 'processor_obj' not in st.session_state:
+    st.session_state.processor_obj = processor
+if 'labels_dict' not in st.session_state:
+    st.session_state.labels_dict = labels
+if 'max_consecutive_repetitions_val' not in st.session_state:
+    st.session_state.max_consecutive_repetitions_val = MAX_CONSECUTIVE_REPETITIONS
+
+# Initialize session state for live predictions if not already present
+if 'live_realtime_pred' not in st.session_state:
+    st.session_state.live_realtime_pred = ""
+if 'live_unique_letters' not in st.session_state:
+    st.session_state.live_unique_letters = ""
+if 'live_predicted_frames_buffer' not in st.session_state:
+    st.session_state.live_predicted_frames_buffer = []
+
+
+class SignLanguageVideoProcessor(VideoProcessorBase):
+    def __init__(self):
+        self.model = st.session_state.model_obj
+        self.processor = st.session_state.processor_obj
+        self.labels = st.session_state.labels_dict
+        self.max_consecutive_repetitions = st.session_state.max_consecutive_repetitions_val
+        self.last_predicted_label = None
+        self.consecutive_repetitions = 0
+
+    def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
+        img_pil = frame.to_image().convert("RGB")
+
+        inputs = self.processor(images=img_pil, return_tensors="pt")
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            logits = outputs.logits
+
+        predicted_label_index = torch.argmax(logits, dim=1).item()
+        current_predicted_label = self.labels[str(predicted_label_index)]
+
+        # Update the buffer of all predicted letters
+        st.session_state.live_predicted_frames_buffer.append(current_predicted_label)
+
+        # Apply repetition logic for real-time display
+        if current_predicted_label == self.last_predicted_label:
+            self.consecutive_repetitions += 1
+        else:
+            self.consecutive_repetitions = 1
+
+        if self.consecutive_repetitions > self.max_consecutive_repetitions or self.last_predicted_label is None:
+            st.session_state.live_realtime_pred = current_predicted_label
+        self.last_predicted_label = current_predicted_label
+
+        # Update unique letters from the buffer
+        unique_preds = list(dict.fromkeys(st.session_state.live_predicted_frames_buffer))
+        st.session_state.live_unique_letters = ", ".join(unique_preds)
+
+        return frame # Return original frame (or modified frame if drawing text)
+
 def sign_language_classification_streamlit(video_path):
     print("sign_language_classification_streamlit function called.")
     predicted_letters = []
@@ -84,7 +145,11 @@ def sign_language_classification_streamlit(video_path):

 st.set_page_config(page_title="ASL Translator", layout="centered")
 st.title("ASL Translator")
-st.markdown("Upload a video to translate ASL into one of the 26 sign language alphabet categories and see predictions. ASL Words Translator coming soon!")
+st.markdown("Upload a video or use your webcam to translate ASL into one of the 26 sign language alphabet categories and see predictions. ASL Words Translator coming soon!")
+
+
+# --- Section for Uploaded Video ---
+st.subheader("Translate from Uploaded Video")

 uploaded_file = st.file_uploader("Upload a video file", type=["mp4", "avi", "mov", "webm"])

@@ -95,17 +160,50 @@ if uploaded_file is not None:
         f.write(uploaded_file.getbuffer())
     st.video(video_path)

-    if st.button("Translate ASL"):
+    if st.button("Translate ASL (from file)"):
         with st.spinner("Translating video... This might take a while depending on video length."):
             realtime_pred, unique_letters = sign_language_classification_streamlit(video_path)
         st.success("Translation Complete!")

-        st.subheader("Last Predicted Sign (
+        st.subheader("Last Predicted Sign (from file)")
         st.write(realtime_pred)

-        st.subheader("Unique Predicted Letters")
+        st.subheader("Unique Predicted Letters (from file)")
         st.write(unique_letters)

     os.remove(video_path) # Clean up temporary file
 else:
     st.info("Please upload a video file to start the translation.")
+
+st.markdown("---")
+
+# --- Section for Live Webcam ---
+st.subheader("Live ASL Translation from Webcam")
+
+# Placeholders for live updates
+live_realtime_placeholder = st.empty()
+live_unique_letters_placeholder = st.empty()
+
+webrtc_ctx = webrtc_streamer(
+    key="webrtc_asl",
+    mode="sendrecv",
+    # rtc_configuration=ClientSettings( # Removed for broader compatibility, relies on default STUN/TURN
+    #     rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
+    # ).rtc_configuration,
+    video_processor_factory=SignLanguageVideoProcessor,
+    media_stream_constraints={"video": True, "audio": False},
+    async_processing=True,
+)
+
+if webrtc_ctx.state.playing:
+    # Update placeholders based on session state. These will update on each rerun triggered by session_state changes.
+    live_realtime_placeholder.markdown(f"**Real-time Prediction:** {st.session_state.live_realtime_pred}")
+    live_unique_letters_placeholder.markdown(f"**Unique Predicted Letters:** {st.session_state.live_unique_letters}")
+else:
+    # Reset session state when webcam is not playing
+    if st.session_state.live_realtime_pred != "" or st.session_state.live_unique_letters != "":
+        st.session_state.live_realtime_pred = ""
+        st.session_state.live_unique_letters = ""
+        st.session_state.live_predicted_frames_buffer = []
+    st.info("Click 'Start' to begin live ASL translation from your webcam.")
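Review note on the webcam path: streamlit-webrtc calls `recv()` on a background media thread, and `st.session_state` is not reliably readable or writable outside the Streamlit script thread, so the session-state reads/writes inside `SignLanguageVideoProcessor` (and the placeholder updates that depend on them) may never take effect. The pattern the library's examples use instead is to keep results on the processor instance behind a lock and poll them from the main script. Below is a minimal sketch of that pattern, not part of this commit: `ThreadSafeASLProcessor` is an illustrative name, and the module-level `model`, `processor`, and `labels` objects are assumed from app.py. Two smaller, version-dependent caveats: `ClientSettings` is deprecated in recent streamlit-webrtc releases (here it is only referenced in commented-out code, so the import can likely be dropped), and `mode` is documented as the `WebRtcMode` enum (`WebRtcMode.SENDRECV`) rather than the string `"sendrecv"`.

import threading

import av
import torch
from streamlit_webrtc import VideoProcessorBase

class ThreadSafeASLProcessor(VideoProcessorBase):
    """Illustrative sketch: results live on the instance behind a lock
    instead of in st.session_state, which the recv() worker thread
    cannot safely touch."""

    def __init__(self):
        self._lock = threading.Lock()
        self.latest_pred = ""   # most recent prediction
        self.seen_letters = []  # ordered unique predictions

    def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
        img = frame.to_image().convert("RGB")
        inputs = processor(images=img, return_tensors="pt")  # `processor` from app.py
        with torch.no_grad():
            logits = model(**inputs).logits                  # `model` from app.py
        pred = labels[str(logits.argmax(dim=1).item())]      # `labels` dict from app.py
        with self._lock:
            self.latest_pred = pred
            if pred not in self.seen_letters:
                self.seen_letters.append(pred)
        return frame

# Main script: poll the processor instead of reading session state, e.g.
#   if webrtc_ctx.state.playing and webrtc_ctx.video_processor:
#       with webrtc_ctx.video_processor._lock:
#           live_realtime_placeholder.markdown(webrtc_ctx.video_processor.latest_pred)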
requirements.txt CHANGED

@@ -4,3 +4,6 @@ opencv-python-headless
 transformers
 torch
 Pillow
+streamlit_webrtc
+pyav
+numpy
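Note on the new requirements: the `av` module that app.py imports comes from PyAV, whose canonical PyPI distribution is named `av`; if `pip install pyav` fails to resolve (or pulls a third-party fork), swapping that line to `av` is the likely fix. `streamlit_webrtc` is fine as written, since pip normalizes it to the published name `streamlit-webrtc`. A quick sanity check that the three additions import under the names app.py expects:

import av                # provided by PyAV; published on PyPI as "av"
import numpy as np
import streamlit_webrtc  # pip normalizes streamlit_webrtc -> streamlit-webrtc

print("av", av.__version__, "| numpy", np.__version__)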