File size: 3,080 Bytes
db6710f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import streamlit as st
import cv2
import numpy as np
import requests
from gtts import gTTS
import tempfile
import pandas

# ---------------- STREAMLIT CONFIG ----------------
# Page-wide settings; set_page_config must run before any other st.* call.
st.set_page_config(page_title="Hand2Voice", layout="wide")
st.title("🀟 Hand2Voice")
st.write("Hand Gesture to Voice Conversion")

# ---------------- CONSTANTS ----------------
# Remote JSON file; expected shape: {"gestures": {name: {"pattern": [...]}}}.
GESTURE_URL = "https://raw.githubusercontent.com/imarshbir/Hand2Voice/main/gestures/gesture_rules.json"

# ---------------- LOAD GESTURES ----------------
@st.cache_data
def load_gestures():
    """Fetch the gesture-rules JSON from GitHub and return its "gestures" dict.

    Cached by Streamlit, so the network request happens at most once per
    session.

    Returns:
        dict: gesture name -> rule info (expected to contain a "pattern" list).

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.Timeout: if the server does not respond within 10 seconds.
    """
    # timeout keeps the app from hanging forever on a dead connection;
    # raise_for_status turns an HTTP error page into a clear exception
    # instead of a confusing JSON decode failure.
    response = requests.get(GESTURE_URL, timeout=10)
    response.raise_for_status()
    return response.json()["gestures"]

# ---------------- LAZY MEDIAPIPE ----------------
@st.cache_resource
def load_mediapipe():
    """Build and cache a single-hand MediaPipe Hands detector.

    mediapipe is imported lazily inside the function so the heavy
    dependency loads only when the detector is first needed; the
    st.cache_resource decorator ensures it is constructed once.
    """
    import mediapipe as mp

    return mp.solutions.hands.Hands(
        static_image_mode=True,
        max_num_hands=1,
        min_detection_confidence=0.7,
    )

# ---------------- FINGER LOGIC ----------------
def get_finger_states(hand_landmarks):
    """Return five 0/1 flags (thumb, index, middle, ring, pinky); 1 = extended.

    The thumb counts as extended when its tip (landmark 4) lies to the
    right of joint 3 (x comparison); each other finger counts as extended
    when its tip is above its base joint (smaller y in image coordinates).
    NOTE(review): the thumb x-test presumably assumes one hand orientation
    (e.g. a right hand facing the camera) — confirm for mirrored input.
    """
    lm = hand_landmarks.landmark

    # Thumb: horizontal comparison against its own joint.
    thumb = int(lm[4].x > lm[3].x)

    # Index..pinky: (tip, base) landmark index pairs.
    tip_base = [(8, 6), (12, 10), (16, 14), (20, 18)]
    return [thumb] + [int(lm[tip].y < lm[base].y) for tip, base in tip_base]

# ---------------- MATCH GESTURE ----------------
def match_gesture(states, rules):
    """Return the name of the first rule whose "pattern" equals *states*.

    *rules* maps gesture name -> info dict with a "pattern" list; returns
    "Unknown Gesture" when no pattern matches.
    """
    return next(
        (name for name, info in rules.items() if info["pattern"] == states),
        "Unknown Gesture",
    )

# ---------------- RECOGNITION ----------------
def recognize_gesture(frame, hands, rules):
    """Classify the hand gesture visible in a BGR *frame*.

    Converts the frame to RGB, runs the MediaPipe *hands* detector, and
    matches the first detected hand's finger pattern against *rules*.
    Returns the matched gesture name, "Unknown Gesture" for an unmatched
    pattern, or "No Hand Detected" when no hand is found.
    """
    detection = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    if not detection.multi_hand_landmarks:
        return "No Hand Detected"

    # Only the first hand matters (detector is configured for max one).
    first_hand = detection.multi_hand_landmarks[0]
    return match_gesture(get_finger_states(first_hand), rules)

# ---------------- TEXT TO SPEECH ----------------
def speak_text(text):
    """Synthesize *text* to speech (English) and return the MP3 file path.

    The temp file is created with delete=False, so it survives this call;
    the caller (st.audio) reads it afterwards.

    Returns:
        str: path to the generated .mp3 file.
    """
    tts = gTTS(text=text, lang="en")
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    # Close our handle before gTTS writes to the path: keeping it open
    # leaks a file descriptor per call and fails on Windows, where an
    # open temp file cannot be reopened for writing by another handle.
    tmp.close()
    tts.save(tmp.name)
    return tmp.name

# ---------------- UI ----------------
camera_col, output_col = st.columns(2)

with camera_col:
    st.subheader("πŸ“· Camera")
    image = st.camera_input("Capture hand gesture")

with output_col:
    st.subheader("πŸ“ Output")

    if not image:
        st.info("Capture an image to start")
    else:
        gestures = load_gestures()
        hands = load_mediapipe()

        # Decode the captured JPEG bytes into an OpenCV BGR frame.
        frame = cv2.imdecode(
            np.frombuffer(image.getvalue(), np.uint8), cv2.IMREAD_COLOR
        )

        gesture = recognize_gesture(frame, hands, gestures)
        st.success(f"πŸ”Š {gesture}")

        # Speak only real gestures, never the fallback labels.
        if gesture not in ["Unknown Gesture", "No Hand Detected"]:
            st.audio(speak_text(gesture))