import os
os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"

import gradio as gr
import cv2
import json
import time
from gtts import gTTS
import tempfile
import mediapipe as mp

# ---------------- LOAD GESTURES ----------------
with open("gestures_rules.json", "r") as f:
    gesture_data = json.load(f)["gestures"]
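# A minimal sketch of the expected gestures_rules.json layout (the file
# itself is not shown here, and the gesture names below are illustrative;
# the structure follows how the code reads it: a top-level "gestures"
# object mapping each name to a 5-element 0/1 "pattern", one flag per
# finger from thumb to pinky):
#
# {
#     "gestures": {
#         "A": {"pattern": [0, 1, 0, 0, 0]},
#         "B": {"pattern": [0, 1, 1, 0, 0]}
#     }
# }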

# ---------------- MEDIAPIPE (CPU ONLY) ----------------
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7
)
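# Hands runs a palm detector followed by a 21-landmark hand model; the two
# confidence thresholds gate initial detection and frame-to-frame tracking.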

# ---------------- UTIL ----------------
def get_finger_states(hand_landmarks):
    """Return a 0/1 flag per finger (thumb..pinky): 1 means extended.

    A fingertip above its PIP joint (smaller y, since image y grows
    downward) counts as extended. For the thumb this compares the tip
    against the MCP joint, which assumes a roughly upright hand.
    """
    tips = [4, 8, 12, 16, 20]
    pips = [2, 6, 10, 14, 18]
    return [
        1 if hand_landmarks.landmark[t].y < hand_landmarks.landmark[p].y else 0
        for t, p in zip(tips, pips)
    ]

def detect_gesture(states):
    for name, rule in gesture_data.items():
        if rule["pattern"] == states:
            return name
    return None
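# For example, with the illustrative rules sketched above,
# detect_gesture([0, 1, 0, 0, 0]) would return "A", and any state list
# with no matching pattern returns None.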

def speak_text(text):
    # gTTS needs network access to synthesize the audio; delete=False keeps
    # the file on disk so Gradio can serve it afterwards.
    tts = gTTS(text=text)
    f = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    f.close()  # close the handle before gTTS writes to the path
    tts.save(f.name)
    return f.name

# ---------------- FRAME PROCESS ----------------
def process_frame(frame, sentence, last_char, last_time):
    if frame is None:
        return frame, sentence, last_char, last_time

    # Mirror the frame for a selfie view. Gradio supplies RGB frames when
    # type="numpy", which is what MediaPipe expects, so no BGR->RGB
    # conversion is needed.
    frame = cv2.flip(frame, 1)
    result = hands.process(frame)

    if result.multi_hand_landmarks:
        hand = result.multi_hand_landmarks[0]
        states = get_finger_states(hand)
        char = detect_gesture(states)

        # Debounce: append only when the gesture changes and at least one
        # second has passed since the last accepted character.
        now = time.time()
        if char and char != last_char and now - last_time > 1:
            sentence += char
            last_char = char
            last_time = now

    return frame, sentence, last_char, last_time

def clear_text():
    return "", "", 0.0

def speak(sentence):
    return speak_text(sentence) if sentence else None

# ---------------- CSS ----------------
with open("styles.css") as f:
    custom_css = f.read()

# ---------------- UI ----------------
with gr.Blocks(title="Hand2Voice", css=custom_css) as demo:

    gr.Markdown("## 🀟 Hand2Voice – Gesture to Speech")

    with gr.Row():
        with gr.Column():
            # gr.Image has no `live` parameter; webcam capture uses
            # sources=["webcam"] with streaming=True (Gradio 4+ API).
            webcam = gr.Image(
                label="Webcam",
                type="numpy",
                sources=["webcam"],
                streaming=True
            )

        with gr.Column():
            output = gr.HTML("<h3>Waiting for gestures...</h3>")
            speak_btn = gr.Button("πŸ”Š Speak")
            clear_btn = gr.Button("🧹 Clear")
            audio = gr.Audio(autoplay=True)

    sentence = gr.State("")
    last_char = gr.State("")
    last_time = gr.State(0.0)

    # Streaming images fire the `stream` event rather than `change`.
    webcam.stream(
        process_frame,
        inputs=[webcam, sentence, last_char, last_time],
        outputs=[webcam, sentence, last_char, last_time]
    ).then(
        lambda s: f"<h2>{s}</h2>",
        sentence,
        output
    )

    clear_btn.click(clear_text, outputs=[sentence, last_char, last_time])
    speak_btn.click(speak, sentence, audio)

demo.launch()