import os

# Force CPU-only inference; MediaPipe must see this variable before it is
# imported below.
os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"

import gradio as gr
import cv2
import numpy as np
import json
import time
import tempfile
from gtts import gTTS
import mediapipe as mp
# ---------------- LOAD GESTURES ----------------
with open("gestures_rules.json", "r") as f:
gesture_data = json.load(f)["gestures"]
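# gestures_rules.json is not shown in this snapshot; from the lookup above
# and the exact-match comparison in detect_gesture() below, it is assumed to
# map gesture names to 5-element finger patterns, e.g.:
#
#   {
#     "gestures": {
#       "A": {"pattern": [0, 0, 0, 0, 0]},
#       "B": {"pattern": [0, 1, 1, 1, 1]}
#     }
#   }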
# ---------------- MEDIAPIPE (CPU ONLY) ----------------
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7,
)
# ---------------- UTIL ----------------
def get_finger_states(hand_landmarks):
    tips = [4, 8, 12, 16, 20]
    pips = [2, 6, 10, 14, 18]
    return [
        1 if hand_landmarks.landmark[t].y < hand_landmarks.landmark[p].y else 0
        for t, p in zip(tips, pips)
    ]
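
# Example: with MediaPipe's hand-landmark numbering (0 = wrist, 4 = thumb
# tip, 8 = index tip, ...), an upright open palm yields [1, 1, 1, 1, 1] and
# a fist yields [0, 0, 0, 0, 0], because image y-coordinates grow downward.
# Comparing each tip against the joint two links below it is a purely
# vertical heuristic; it misreads hands held sideways or upside down.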
def detect_gesture(states):
    for name, rule in gesture_data.items():
        if rule["pattern"] == states:
            return name
    return None
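
# Matching is exact and first-wins: the frame's finger-state list must equal
# a gesture's "pattern" entry, and if two gestures share a pattern, the one
# listed first in gestures_rules.json is returned.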
def speak_text(text):
    tts = gTTS(text=text)
    f = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    f.close()  # close the handle first so gTTS can reopen the path on any OS
    tts.save(f.name)
    return f.name
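
# gTTS synthesises speech through Google's online TTS service, so this call
# needs outbound network access; the returned .mp3 path is played back by
# the gr.Audio component defined in the UI below.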
# ---------------- FRAME PROCESS ----------------
def process_frame(frame, sentence, last_char, last_time):
    if frame is None:
        return sentence, last_char, last_time
    frame = cv2.flip(frame, 1)  # mirror the frame so gestures feel natural
    # Gradio delivers frames as RGB numpy arrays, which is what MediaPipe
    # expects, so no cv2.cvtColor round-trip is needed.
    result = hands.process(frame)
    if result.multi_hand_landmarks:
        hand = result.multi_hand_landmarks[0]
        states = get_finger_states(hand)
        char = detect_gesture(states)
        now = time.time()
        # Debounce: only accept a character that differs from the previous
        # one and arrives at least one second after it.
        if char and char != last_char and now - last_time > 1:
            sentence += char
            last_char = char
            last_time = now
    return sentence, last_char, last_time
def clear_text():
    return "", "", 0.0

def speak(sentence):
    return speak_text(sentence) if sentence else None
# ---------------- CSS ----------------
with open("styles.css") as f:
custom_css = f.read()
# ---------------- UI ----------------
with gr.Blocks(title="Hand2Voice", css=custom_css) as demo:
gr.Markdown("## π€ Hand2Voice β Gesture to Speech")
    with gr.Row():
        with gr.Column():
            # Webcam input streams frames continuously into process_frame.
            webcam = gr.Image(
                label="Webcam",
                type="numpy",
                sources=["webcam"],
                streaming=True,
            )
        with gr.Column():
            output = gr.HTML("<h3>Waiting for gestures...</h3>")
            speak_btn = gr.Button("🔊 Speak")
            clear_btn = gr.Button("🧹 Clear")
            audio = gr.Audio(autoplay=True)

    # Per-session state: the sentence built so far, the last accepted
    # character, and the time it was accepted.
    sentence = gr.State("")
    last_char = gr.State("")
    last_time = gr.State(0.0)
    # The processed frame is deliberately not written back to the webcam
    # component (that would retrigger the event); only the state values are
    # updated, then the current sentence is rendered into the HTML panel.
    webcam.stream(
        process_frame,
        inputs=[webcam, sentence, last_char, last_time],
        outputs=[sentence, last_char, last_time],
    ).then(
        lambda s: f"<h2>{s}</h2>",
        sentence,
        output,
    )
    clear_btn.click(clear_text, outputs=[sentence, last_char, last_time])
    speak_btn.click(speak, sentence, audio)

demo.launch()