Spaces: Runtime error
Update app.py
app.py CHANGED
```diff
@@ -1,3 +1,6 @@
+import os
+os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"
+
 import gradio as gr
 import cv2
 import numpy as np
@@ -5,11 +8,7 @@ import json
 import time
 from gtts import gTTS
 import tempfile
-import requests
 from PIL import Image
-from io import BytesIO
-
-# ✅ FIXED MEDIAPIPE IMPORT (HF SAFE)
 import mediapipe as mp
 from mediapipe import solutions
 
@@ -17,41 +16,39 @@ from mediapipe import solutions
 with open("gestures_rules.json", "r") as f:
     gesture_data = json.load(f)["gestures"]
 
-# ---------------- MEDIAPIPE
+# ---------------- MEDIAPIPE (CPU ONLY) ----------------
 mp_hands = solutions.hands
 hands = mp_hands.Hands(
+    static_image_mode=False,
     max_num_hands=1,
     min_detection_confidence=0.7,
     min_tracking_confidence=0.7
 )
 
-# ---------------- UTIL
+# ---------------- UTIL ----------------
 def get_finger_states(hand_landmarks):
     tips = [4, 8, 12, 16, 20]
     pips = [2, 6, 10, 14, 18]
-
-    states = []
-    for tip, pip in zip(tips, pips):
-        states.append(
-            1 if hand_landmarks.landmark[tip].y <
-            hand_landmarks.landmark[pip].y else 0
-        )
-    return states
+    return [
+        1 if hand_landmarks.landmark[t].y <
+        hand_landmarks.landmark[p].y else 0
+        for t, p in zip(tips, pips)
+    ]
 
 def detect_gesture(states):
-    for
-    if
-        return
+    for k, v in gesture_data.items():
+        if v["pattern"] == states:
+            return k
     return None
 
 def speak_text(text):
     tts = gTTS(text=text)
-
-    tts.save(
-    return
+    f = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    tts.save(f.name)
+    return f.name
 
-# ----------------
-def process_frame(frame, sentence, last_char, last_time):
+# ---------------- VIDEO PROCESS ----------------
+def process_video(frame, sentence, last_char, last_time):
     if frame is None:
         return frame, sentence, last_char, last_time
 
@@ -65,108 +62,57 @@ def process_frame(frame, sentence, last_char, last_time):
     char = detect_gesture(states)
 
     now = time.time()
-    if (
-        char
-        and char != last_char
-        and now - last_time > 1
-        and len(char) == 1
-    ):
+    if char and char != last_char and now - last_time > 1:
         sentence += char
         last_char = char
         last_time = now
 
     return frame, sentence, last_char, last_time
 
-# ---------------- BUTTON ACTIONS ----------------
 def clear_text():
     return "", "", 0.0
 
 def speak(sentence):
-    if sentence:
-        return speak_text(sentence)
-    return None
+    return speak_text(sentence) if sentence else None
 
-# ----------------
-logo_url = "https://raw.githubusercontent.com/imarshbir/Hand2Voice/main/logo.png"
-logo = Image.open(BytesIO(requests.get(logo_url).content))
-
-# ---------------- LOAD CSS ----------------
+# ---------------- CSS ----------------
 with open("styles.css") as f:
     custom_css = f.read()
 
 # ---------------- UI ----------------
-with gr.Blocks(
-
+with gr.Blocks(title="Hand2Voice") as demo:
 
-    gr.Image(
-        value=logo,
-        show_label=False,
-        height=100
-    )
-
-    gr.HTML("""
-    <div class='mission-box'>
-        <div class='mission-title'>Bridging Communication Gaps with AI</div>
-        <div class='mission-text'>
-            An intelligent system converting hand gestures into speech
-            in real time for inclusive communication.
-        </div>
-    </div>
-    """)
+    gr.Markdown("## 🤟 Hand2Voice – Gesture to Speech")
 
     with gr.Row():
-        with gr.Column(scale=1
-            gr.
-
-                source="webcam",
+        with gr.Column(scale=1):
+            webcam = gr.Video(
+                sources=["webcam"],
                 streaming=True,
-
+                height=360
             )
 
         with gr.Column(scale=1):
-            gr.
-
-
-            )
-
-
-
-
-    # ---------------- STATES ----------------
-    sentence_state = gr.State("")
-    last_char_state = gr.State("")
-    last_time_state = gr.State(0.0)
-
-    # ---------------- STREAM ----------------
+            output = gr.HTML("<h3>Waiting for gestures...</h3>")
+            speak_btn = gr.Button("🔊 Speak")
+            clear_btn = gr.Button("🧹 Clear")
+            audio = gr.Audio(autoplay=True)
 
+    sentence = gr.State("")
+    last_char = gr.State("")
+    last_time = gr.State(0.0)
 
     webcam.stream(
-
-        inputs=[webcam,
-        outputs=[webcam,
+        process_video,
+        inputs=[webcam, sentence, last_char, last_time],
+        outputs=[webcam, sentence, last_char, last_time]
     ).then(
-        lambda s: f"<
-
-
-    )
-
-    # ---------------- BUTTONS ----------------
-    clear_btn.click(
-        clear_text,
-        outputs=[sentence_state, last_char_state, last_time_state]
-    )
-
-    speak_btn.click(
-        speak,
-        inputs=sentence_state,
-        outputs=audio_out
+        lambda s: f"<h2>{s}</h2>",
+        sentence,
+        output
     )
 
-
-    gr.HTML("""
-    <div class='footer-container'>
-        <div class='footer-title'>Hand2Voice</div>
-        <div class='footer-tagline'>Powered by Computer Vision & NLP</div>
-        <div class='footer-tagline'>Made by Arshbir Singh</div>
-    </div>
-    """)
+    clear_btn.click(clear_text, outputs=[sentence, last_char, last_time])
+    speak_btn.click(speak, sentence, audio)
 
-demo.launch()
+demo.launch(css=custom_css)
```
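Note: `detect_gesture` matches the five finger states against each rule's `pattern` list, and the old guard `len(char) == 1` suggests the gesture keys are single characters. `gestures_rules.json` itself is not part of this commit, so the sketch below is only an assumed shape, with illustrative names and patterns:

```python
# Hypothetical gestures_rules.json content, inferred from
# json.load(f)["gestures"] and v["pattern"] == states; not the
# Space's actual rules. Each pattern is [thumb, index, middle,
# ring, pinky] with 1 = finger extended, 0 = folded.
import json

sample_rules = {
    "gestures": {
        "A": {"pattern": [0, 0, 0, 0, 0]},  # closed fist
        "B": {"pattern": [0, 1, 1, 1, 1]},  # four fingers up, thumb folded
        "Y": {"pattern": [1, 0, 0, 0, 1]},  # thumb and pinky out
    }
}

with open("gestures_rules.json", "w") as f:
    json.dump(sample_rules, f, indent=2)
```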
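The `get_finger_states` heuristic works in MediaPipe's normalized image coordinates, where y grows downward: a finger counts as extended (1) when its tip landmark (4, 8, 12, 16, 20) sits above its PIP joint (2, 6, 10, 14, 18). A minimal, self-contained check of that comparison with stubbed landmarks, no camera or MediaPipe required:

```python
from types import SimpleNamespace

def get_finger_states(hand_landmarks):
    tips = [4, 8, 12, 16, 20]
    pips = [2, 6, 10, 14, 18]
    return [
        1 if hand_landmarks.landmark[t].y <
        hand_landmarks.landmark[p].y else 0
        for t, p in zip(tips, pips)
    ]

# 21 stub landmarks at y = 0.5, then raise the index fingertip
# (landmark 8) above its PIP joint (landmark 6); image y grows
# downward, so "raised" means a smaller y value.
lm = [SimpleNamespace(y=0.5) for _ in range(21)]
lm[8] = SimpleNamespace(y=0.2)
hand = SimpleNamespace(landmark=lm)

print(get_finger_states(hand))  # [0, 1, 0, 0, 0] -> only the index is extended
```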
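The last hunk's context begins at `char = detect_gesture(states)`, so the unchanged code that computes `states` from the incoming frame never appears in this diff. The sketch below is the standard MediaPipe Hands pattern for that step, an assumption rather than the commit's literal code (the helper name `states_from_frame` is hypothetical):

```python
# Sketch of the undisplayed context between the hunks, assuming
# the usual MediaPipe Hands loop; not the Space's actual lines.
import cv2

def states_from_frame(frame, hands, get_finger_states):
    # Assumes an OpenCV-style BGR frame; MediaPipe expects RGB.
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb)  # run hand landmark detection
    if not results.multi_hand_landmarks:
        return None
    # hands was built with max_num_hands=1, so the first hand is the only one.
    return get_finger_states(results.multi_hand_landmarks[0])
```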