import os

# Must be set before mediapipe is imported so inference falls back to CPU.
os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"

import json
import tempfile
import time

import cv2
import gradio as gr
import mediapipe as mp
from gtts import gTTS
# ---------------- LOAD GESTURES ----------------
with open("gestures_rules.json", "r") as f:
    gesture_data = json.load(f)["gestures"]
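# The rules file is assumed to have the shape sketched below; only the
# top-level "gestures" key and each entry's "pattern" list are read by this
# app. The gesture names and patterns here are illustrative, not the real file:
#
# {
#   "gestures": {
#     "A": {"pattern": [0, 1, 0, 0, 0]},
#     "B": {"pattern": [0, 1, 1, 1, 1]}
#   }
# }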
# ---------------- MEDIAPIPE (CPU ONLY) ----------------
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7,
)
# ---------------- UTIL ----------------
def get_finger_states(hand_landmarks):
    """Return one 0/1 flag per digit (thumb..pinky): 1 means extended.

    A digit counts as extended when its tip sits above its lower joint in
    image coordinates (smaller y is higher on screen). This assumes an
    upright hand and is only a rough heuristic for the thumb, which flexes
    sideways rather than vertically.
    """
    tips = [4, 8, 12, 16, 20]   # MediaPipe fingertip landmark indices
    pips = [2, 6, 10, 14, 18]   # corresponding lower-joint landmarks
    return [
        1 if hand_landmarks.landmark[t].y < hand_landmarks.landmark[p].y else 0
        for t, p in zip(tips, pips)
    ]
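# Example: an upright open palm typically yields [1, 1, 1, 1, 1] and a closed
# fist [0, 0, 0, 0, 0]; detect_gesture below maps such patterns to whatever
# strings gestures_rules.json defines for them.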
def detect_gesture(states):
    """Return the first gesture name whose finger pattern matches exactly."""
    for name, rule in gesture_data.items():
        if rule["pattern"] == states:
            return name
    return None
def speak_text(text):
    """Render text to speech with gTTS and return the path of a temp MP3."""
    tts = gTTS(text=text)
    f = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    f.close()  # close the handle first so gTTS can reopen the path on Windows
    tts.save(f.name)
    return f.name
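# Note: with delete=False the generated MP3s accumulate in the temp directory
# for the lifetime of the process; gr.Audio only needs the returned file path.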
# ---------------- FRAME PROCESS ----------------
def process_frame(frame, sentence, last_char, last_time):
    if frame is None:
        return frame, sentence, last_char, last_time
    frame = cv2.flip(frame, 1)  # mirror so the preview behaves like a mirror
    # Gradio delivers RGB numpy frames and MediaPipe expects RGB, so no
    # cv2.cvtColor round-trip is needed here.
    result = hands.process(frame)
    if result.multi_hand_landmarks:
        hand = result.multi_hand_landmarks[0]
        states = get_finger_states(hand)
        char = detect_gesture(states)
        now = time.time()
        # Debounce: only append when the gesture changed and at least one
        # second has passed since the last appended character.
        if char and char != last_char and now - last_time > 1:
            sentence += char
            last_char = char
            last_time = now
    return frame, sentence, last_char, last_time
def clear_text():
    return "", "", 0.0

def speak(sentence):
    return speak_text(sentence) if sentence else None
# ---------------- CSS ----------------
with open("styles.css") as f:
    custom_css = f.read()
# ---------------- UI ----------------
# css= is a gr.Blocks parameter, not a launch() argument.
with gr.Blocks(title="Hand2Voice", css=custom_css) as demo:
    gr.Markdown("## 🀟 Hand2Voice – Gesture to Speech")
    with gr.Row():
        with gr.Column():
            # gr.Image has no live= parameter; in the Gradio 4.x API a live
            # webcam feed is requested with sources=["webcam"] and streaming=True.
            webcam = gr.Image(
                label="Webcam",
                type="numpy",
                sources=["webcam"],
                streaming=True,
            )
        with gr.Column():
            output = gr.HTML("<h3>Waiting for gestures...</h3>")
            speak_btn = gr.Button("πŸ”Š Speak")
            clear_btn = gr.Button("🧹 Clear")
            audio = gr.Audio(autoplay=True)

    # Per-session state: the sentence built so far, the last appended
    # character, and when it was appended (for debouncing).
    sentence = gr.State("")
    last_char = gr.State("")
    last_time = gr.State(0.0)
    # A streaming image fires the .stream event on each frame rather than
    # .change (which would also re-fire when the function writes back).
    webcam.stream(
        process_frame,
        inputs=[webcam, sentence, last_char, last_time],
        outputs=[webcam, sentence, last_char, last_time],
    ).then(
        lambda s: f"<h2>{s}</h2>",
        sentence,
        output,
    )
    clear_btn.click(clear_text, outputs=[sentence, last_char, last_time])
    speak_btn.click(speak, sentence, audio)
demo.launch()