arshtech committed (verified)
Commit 6b37272 · Parent: 7e04624

Update app.py
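
In short: the networked logo fetch (`requests` / `BytesIO`) is gone, the UI is trimmed to a single header, the webcam input moves from `gr.Image` to `gr.Video`, and MediaPipe is pinned to the CPU so the Space runs in a container without a GPU. Note the ordering the diff establishes at the top of the file: the environment variable is set before `mediapipe` is imported. A minimal sketch of that pattern, assuming nothing beyond what the diff itself adds:

```python
import os

# Set this before importing mediapipe: the flag is consulted while the
# native library initializes, so exporting it afterwards may not take effect.
os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"

import mediapipe as mp  # deliberately imported after the env var is set
```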

Files changed (1)
  1. app.py +44 -98
app.py CHANGED
@@ -1,3 +1,6 @@
+import os
+os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"
+
 import gradio as gr
 import cv2
 import numpy as np
@@ -5,11 +8,7 @@ import json
 import time
 from gtts import gTTS
 import tempfile
-import requests
 from PIL import Image
-from io import BytesIO
-
-# ✅ FIXED MEDIAPIPE IMPORT (HF SAFE)
 import mediapipe as mp
 from mediapipe import solutions
 
@@ -17,41 +16,39 @@ from mediapipe import solutions
 with open("gestures_rules.json", "r") as f:
     gesture_data = json.load(f)["gestures"]
 
-# ---------------- MEDIAPIPE SETUP ----------------
+# ---------------- MEDIAPIPE (CPU ONLY) ----------------
 mp_hands = solutions.hands
 hands = mp_hands.Hands(
+    static_image_mode=False,
     max_num_hands=1,
     min_detection_confidence=0.7,
    min_tracking_confidence=0.7
 )
 
-# ---------------- UTIL FUNCTIONS ----------------
+# ---------------- UTIL ----------------
 def get_finger_states(hand_landmarks):
     tips = [4, 8, 12, 16, 20]
     pips = [2, 6, 10, 14, 18]
-
-    states = []
-    for tip, pip in zip(tips, pips):
-        states.append(
-            1 if hand_landmarks.landmark[tip].y <
-            hand_landmarks.landmark[pip].y else 0
-        )
-    return states
+    return [
+        1 if hand_landmarks.landmark[t].y <
+        hand_landmarks.landmark[p].y else 0
+        for t, p in zip(tips, pips)
+    ]
 
 def detect_gesture(states):
-    for name, rule in gesture_data.items():
-        if rule["pattern"] == states:
-            return name
+    for k, v in gesture_data.items():
+        if v["pattern"] == states:
+            return k
     return None
 
 def speak_text(text):
     tts = gTTS(text=text)
-    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-    tts.save(temp.name)
-    return temp.name
+    f = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    tts.save(f.name)
+    return f.name
 
-# ---------------- FRAME PROCESSING ----------------
-def process_frame(frame, sentence, last_char, last_time):
+# ---------------- VIDEO PROCESS ----------------
+def process_video(frame, sentence, last_char, last_time):
     if frame is None:
         return frame, sentence, last_char, last_time
 
@@ -65,108 +62,57 @@ def process_frame(frame, sentence, last_char, last_time):
     char = detect_gesture(states)
 
     now = time.time()
-    if (
-        char
-        and char != last_char
-        and now - last_time > 1
-        and len(char) == 1
-    ):
+    if char and char != last_char and now - last_time > 1:
         sentence += char
         last_char = char
         last_time = now
 
     return frame, sentence, last_char, last_time
 
-# ---------------- BUTTON ACTIONS ----------------
 def clear_text():
     return "", "", 0.0
 
 def speak(sentence):
-    if sentence.strip():
-        return speak_text(sentence)
-    return None
+    return speak_text(sentence) if sentence else None
 
-# ---------------- LOAD LOGO ----------------
-logo_url = "https://raw.githubusercontent.com/imarshbir/Hand2Voice/main/logo.png"
-logo = Image.open(BytesIO(requests.get(logo_url).content))
-
-# ---------------- LOAD CSS ----------------
+# ---------------- CSS ----------------
 with open("styles.css") as f:
     custom_css = f.read()
 
 # ---------------- UI ----------------
-with gr.Blocks(css=custom_css, title="Hand2Voice") as demo:
-
-    gr.Image(
-        value=logo,
-        show_label=False,
-        height=100
-    )
+with gr.Blocks(title="Hand2Voice") as demo:
 
-    gr.HTML("""
-    <div class='mission-box'>
-        <div class='mission-title'>Bridging Communication Gaps with AI</div>
-        <div class='mission-text'>
-            An intelligent system converting hand gestures into speech
-            in real time for inclusive communication.
-        </div>
-    </div>
-    """)
+    gr.Markdown("## 🤟 Hand2Voice – Gesture to Speech")
 
     with gr.Row():
-        with gr.Column(scale=1.1):
-            gr.Markdown("### 🎥 Live Camera Feed")
-            webcam = gr.Image(
-                source="webcam",
+        with gr.Column(scale=1):
+            webcam = gr.Video(
+                sources=["webcam"],
                 streaming=True,
-                type="numpy"
+                height=360
            )
 
         with gr.Column(scale=1):
-            gr.Markdown("### 📝 Translation Output")
-            output_box = gr.HTML(
-                "<div class='output-text'>🤚 Waiting for gestures...</div>"
-            )
+            output = gr.HTML("<h3>Waiting for gestures...</h3>")
+            speak_btn = gr.Button("🔊 Speak")
+            clear_btn = gr.Button("🧹 Clear")
+            audio = gr.Audio(autoplay=True)
 
-    speak_btn = gr.Button("🔊 Speak Sentence")
-    clear_btn = gr.Button("🧹 Clear Text")
-    audio_out = gr.Audio(autoplay=True)
+    sentence = gr.State("")
+    last_char = gr.State("")
+    last_time = gr.State(0.0)
 
-    # ---------------- STATES ----------------
-    sentence_state = gr.State("")
-    last_char_state = gr.State("")
-    last_time_state = gr.State(0.0)
-
-    # ---------------- STREAM ----------------
     webcam.stream(
-        process_frame,
-        inputs=[webcam, sentence_state, last_char_state, last_time_state],
-        outputs=[webcam, sentence_state, last_char_state, last_time_state]
+        process_video,
+        inputs=[webcam, sentence, last_char, last_time],
+        outputs=[webcam, sentence, last_char, last_time]
    ).then(
-        lambda s: f"<div class='output-text'>{s}</div>",
-        inputs=sentence_state,
-        outputs=output_box
-    )
-
-    # ---------------- BUTTONS ----------------
-    clear_btn.click(
-        clear_text,
-        outputs=[sentence_state, last_char_state, last_time_state]
-    )
-
-    speak_btn.click(
-        speak,
-        inputs=sentence_state,
-        outputs=audio_out
+        lambda s: f"<h2>{s}</h2>",
+        sentence,
+        output
     )
 
-    # ---------------- FOOTER ----------------
-    gr.HTML("""
-    <div class='footer-container'>
-        <div class='footer-title'>Hand2Voice</div>
-        <div class='footer-tagline'>Powered by Computer Vision & NLP</div>
-        <div class='footer-tagline'>Made by Arshbir Singh</div>
-    </div>
-    """)
+    clear_btn.click(clear_text, outputs=[sentence, last_char, last_time])
+    speak_btn.click(speak, sentence, audio)
 
-demo.launch()
+demo.launch(css=custom_css)
 
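Two details in the refactored helpers are worth spelling out. `get_finger_states` marks a finger as extended (1) when its TIP landmark has a smaller y than its PIP joint; image y grows downward, so a smaller y means the fingertip sits higher in the frame. `detect_gesture` then looks for an exact match of that five-element pattern in `gestures_rules.json`, whose contents are not part of this diff. A self-contained sketch under that assumption; the key names and patterns below are illustrative stand-ins, not the repo's actual rules:

```python
# Hypothetical stand-in for gestures_rules.json: app.py reads
# json.load(f)["gestures"] and compares each entry's "pattern"
# against [thumb, index, middle, ring, pinky] finger states.
rules = {
    "gestures": {
        "V": {"pattern": [0, 1, 1, 0, 0]},  # index + middle extended
        "B": {"pattern": [0, 1, 1, 1, 1]},  # all but the thumb extended
    }
}
gesture_data = rules["gestures"]

def detect_gesture(states):
    # Exact pattern match, as in app.py; returns None when nothing fits.
    for k, v in gesture_data.items():
        if v["pattern"] == states:
            return k
    return None

print(detect_gesture([0, 1, 1, 0, 0]))  # -> "V"
print(detect_gesture([1, 1, 1, 1, 1]))  # -> None (no matching rule)
```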
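
The diff never shows the middle of `process_video` (the MediaPipe call itself, which this commit leaves unchanged), so no claim is made about those exact lines. For reviewers who want to exercise the detection path outside Gradio, the standard MediaPipe Hands loop below runs the same CPU-only configuration against a local camera; the camera index 0 is an assumption, and the finger-state helper repeats the heuristic from app.py:

```python
import os
os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"  # as in the commit: set before the import

import cv2
import mediapipe as mp

hands = mp.solutions.hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7,
)

def get_finger_states(hand_landmarks):
    # Same heuristic as app.py: fingertip above its PIP joint => extended.
    tips, pips = [4, 8, 12, 16, 20], [2, 6, 10, 14, 18]
    return [1 if hand_landmarks.landmark[t].y < hand_landmarks.landmark[p].y else 0
            for t, p in zip(tips, pips)]

cap = cv2.VideoCapture(0)  # assumption: default local webcam
while cap.isOpened():
    ok, frame = cap.read()
    if not ok:
        break
    # MediaPipe expects RGB; OpenCV captures BGR.
    results = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    if results.multi_hand_landmarks:
        print(get_finger_states(results.multi_hand_landmarks[0]))
cap.release()
```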