import os

# Must be set before mediapipe is imported so inference falls back to CPU.
os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"

import json
import tempfile
import time

import cv2
import gradio as gr
import mediapipe as mp
from gtts import gTTS
# ---------------- LOAD GESTURES ----------------
with open("gestures_rules.json", "r") as f:
    gesture_data = json.load(f)["gestures"]
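# The rules file is assumed to have the shape sketched below; only the
# top-level "gestures" key and each entry's "pattern" list are read by this
# app. The gesture names and patterns here are illustrative, not the real file:
#
# {
#   "gestures": {
#     "A": {"pattern": [0, 1, 0, 0, 0]},
#     "B": {"pattern": [0, 1, 1, 1, 1]}
#   }
# }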
# ---------------- MEDIAPIPE (CPU ONLY) ----------------
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7,
)
# ---------------- UTIL ----------------
def get_finger_states(hand_landmarks):
    """Return one 0/1 flag per digit (thumb..pinky): 1 means extended.

    A digit counts as extended when its tip sits above its lower joint in
    image coordinates (smaller y is higher on screen). This assumes an
    upright hand and is only a rough heuristic for the thumb, which flexes
    sideways rather than vertically.
    """
    tips = [4, 8, 12, 16, 20]   # MediaPipe fingertip landmark indices
    pips = [2, 6, 10, 14, 18]   # corresponding lower-joint landmarks
    return [
        1 if hand_landmarks.landmark[t].y < hand_landmarks.landmark[p].y else 0
        for t, p in zip(tips, pips)
    ]
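# Example: an upright open palm typically yields [1, 1, 1, 1, 1] and a closed
# fist [0, 0, 0, 0, 0]; detect_gesture below maps such patterns to whatever
# strings gestures_rules.json defines for them.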
def detect_gesture(states):
    """Return the first gesture name whose finger pattern matches exactly."""
    for name, rule in gesture_data.items():
        if rule["pattern"] == states:
            return name
    return None
def speak_text(text):
    """Render text to speech with gTTS and return the path of a temp MP3."""
    tts = gTTS(text=text)
    f = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    f.close()  # close the handle first so gTTS can reopen the path on Windows
    tts.save(f.name)
    return f.name
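# Note: with delete=False the generated MP3s accumulate in the temp directory
# for the lifetime of the process; gr.Audio only needs the returned file path.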
# ---------------- FRAME PROCESS ----------------
def process_frame(frame, sentence, last_char, last_time):
    if frame is None:
        return frame, sentence, last_char, last_time
    frame = cv2.flip(frame, 1)  # mirror so the preview behaves like a mirror
    # Gradio delivers RGB numpy frames and MediaPipe expects RGB, so no
    # cv2.cvtColor round-trip is needed here.
    result = hands.process(frame)
    if result.multi_hand_landmarks:
        hand = result.multi_hand_landmarks[0]
        states = get_finger_states(hand)
        char = detect_gesture(states)
        now = time.time()
        # Debounce: only append when the gesture changed and at least one
        # second has passed since the last appended character.
        if char and char != last_char and now - last_time > 1:
            sentence += char
            last_char = char
            last_time = now
    return frame, sentence, last_char, last_time
def clear_text():
    return "", "", 0.0

def speak(sentence):
    return speak_text(sentence) if sentence else None
# ---------------- CSS ----------------
with open("styles.css") as f:
    custom_css = f.read()
# ---------------- UI ----------------
# css= is a gr.Blocks parameter, not a launch() argument.
with gr.Blocks(title="Hand2Voice", css=custom_css) as demo:
    gr.Markdown("## 🀟 Hand2Voice – Gesture to Speech")
    with gr.Row():
        with gr.Column():
            # gr.Image has no live= parameter; in the Gradio 4.x API a live
            # webcam feed is requested with sources=["webcam"] and streaming=True.
            webcam = gr.Image(
                label="Webcam",
                type="numpy",
                sources=["webcam"],
                streaming=True,
            )
        with gr.Column():
            output = gr.HTML("<h3>Waiting for gestures...</h3>")
            speak_btn = gr.Button("πŸ”Š Speak")
            clear_btn = gr.Button("🧹 Clear")
            audio = gr.Audio(autoplay=True)

    # Per-session state: the sentence built so far, the last appended
    # character, and when it was appended (for debouncing).
    sentence = gr.State("")
    last_char = gr.State("")
    last_time = gr.State(0.0)
    # A streaming image fires the .stream event on each frame rather than
    # .change (which would also re-fire when the function writes back).
    webcam.stream(
        process_frame,
        inputs=[webcam, sentence, last_char, last_time],
        outputs=[webcam, sentence, last_char, last_time],
    ).then(
        lambda s: f"<h2>{s}</h2>",
        sentence,
        output,
    )
    clear_btn.click(clear_text, outputs=[sentence, last_char, last_time])
    speak_btn.click(speak, sentence, audio)
demo.launch()