import os
os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"

import gradio as gr
import cv2
import json
import time
from gtts import gTTS
import tempfile
import mediapipe as mp

# ---------------- LOAD GESTURES ----------------
with open("gestures_rules.json", "r") as f:
    gesture_data = json.load(f)["gestures"]
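# A minimal sketch of the expected gestures_rules.json layout (the file
# itself is not shown here, and the gesture names below are illustrative;
# the structure follows how the code reads it: a top-level "gestures"
# object mapping each name to a 5-element 0/1 "pattern", one flag per
# finger from thumb to pinky):
#
# {
#     "gestures": {
#         "A": {"pattern": [0, 1, 0, 0, 0]},
#         "B": {"pattern": [0, 1, 1, 0, 0]}
#     }
# }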

# ---------------- MEDIAPIPE (CPU ONLY) ----------------
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7
)
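# Hands runs a palm detector followed by a 21-landmark hand model; the two
# confidence thresholds gate initial detection and frame-to-frame tracking.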

# ---------------- UTIL ----------------
def get_finger_states(hand_landmarks):
    """Return a 0/1 flag per finger (thumb..pinky): 1 means extended.

    A fingertip above its PIP joint (smaller y, since image y grows
    downward) counts as extended. For the thumb this compares the tip
    against the MCP joint, which assumes a roughly upright hand.
    """
    tips = [4, 8, 12, 16, 20]
    pips = [2, 6, 10, 14, 18]
    return [
        1 if hand_landmarks.landmark[t].y < hand_landmarks.landmark[p].y else 0
        for t, p in zip(tips, pips)
    ]

def detect_gesture(states):
    for name, rule in gesture_data.items():
        if rule["pattern"] == states:
            return name
    return None
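# For example, with the illustrative rules sketched above,
# detect_gesture([0, 1, 0, 0, 0]) would return "A", and any state list
# with no matching pattern returns None.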

def speak_text(text):
    # gTTS needs network access to synthesize the audio; delete=False keeps
    # the file on disk so Gradio can serve it afterwards.
    tts = gTTS(text=text)
    f = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    f.close()  # close the handle before gTTS writes to the path
    tts.save(f.name)
    return f.name

# ---------------- FRAME PROCESS ----------------
def process_frame(frame, sentence, last_char, last_time):
    if frame is None:
        return frame, sentence, last_char, last_time

    # Mirror the frame for a selfie view. Gradio supplies RGB frames when
    # type="numpy", which is what MediaPipe expects, so no BGR->RGB
    # conversion is needed.
    frame = cv2.flip(frame, 1)
    result = hands.process(frame)

    if result.multi_hand_landmarks:
        hand = result.multi_hand_landmarks[0]
        states = get_finger_states(hand)
        char = detect_gesture(states)

        # Debounce: append only when the gesture changes and at least one
        # second has passed since the last accepted character.
        now = time.time()
        if char and char != last_char and now - last_time > 1:
            sentence += char
            last_char = char
            last_time = now

    return frame, sentence, last_char, last_time

def clear_text():
    return "", "", 0.0

def speak(sentence):
    return speak_text(sentence) if sentence else None

# ---------------- CSS ----------------
with open("styles.css") as f:
    custom_css = f.read()

# ---------------- UI ----------------
with gr.Blocks(title="Hand2Voice", css=custom_css) as demo:

    gr.Markdown("## 🀟 Hand2Voice – Gesture to Speech")

    with gr.Row():
        with gr.Column():
            # gr.Image has no `live` parameter; webcam capture uses
            # sources=["webcam"] with streaming=True (Gradio 4+ API).
            webcam = gr.Image(
                label="Webcam",
                type="numpy",
                sources=["webcam"],
                streaming=True
            )

        with gr.Column():
            output = gr.HTML("<h3>Waiting for gestures...</h3>")
            speak_btn = gr.Button("πŸ”Š Speak")
            clear_btn = gr.Button("🧹 Clear")
            audio = gr.Audio(autoplay=True)

    sentence = gr.State("")
    last_char = gr.State("")
    last_time = gr.State(0.0)

    # Streaming images fire the `stream` event rather than `change`.
    webcam.stream(
        process_frame,
        inputs=[webcam, sentence, last_char, last_time],
        outputs=[webcam, sentence, last_char, last_time]
    ).then(
        lambda s: f"<h2>{s}</h2>",
        sentence,
        output
    )

    clear_btn.click(clear_text, outputs=[sentence, last_char, last_time])
    speak_btn.click(speak, sentence, audio)

demo.launch()