File size: 3,080 Bytes
db6710f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import streamlit as st
import cv2
import numpy as np
import requests
from gtts import gTTS
import tempfile
import pandas
# ---------------- STREAMLIT CONFIG ----------------
st.set_page_config(page_title="Hand2Voice", layout="wide")
st.title("π€ Hand2Voice")
st.write("Hand Gesture to Voice Conversion")
# ---------------- CONSTANTS ----------------
GESTURE_URL = "https://raw.githubusercontent.com/imarshbir/Hand2Voice/main/gestures/gesture_rules.json"
# ---------------- LOAD GESTURES ----------------
@st.cache_data
def load_gestures():
    """Fetch the gesture rule set from GitHub.

    Returns the ``"gestures"`` mapping of the remote JSON
    (gesture name -> rule info, each with a ``"pattern"`` list).
    Cached by Streamlit so the network call happens once per session.

    Raises:
        requests.HTTPError: if the server returns an error status.
        requests.Timeout: if the request exceeds the timeout.
    """
    # Timeout prevents the app from hanging forever on a dead connection;
    # raise_for_status surfaces HTTP errors instead of a confusing JSON error.
    response = requests.get(GESTURE_URL, timeout=10)
    response.raise_for_status()
    return response.json()["gestures"]
# ---------------- LAZY MEDIAPIPE ----------------
@st.cache_resource
def load_mediapipe():
    """Create and cache a MediaPipe Hands detector.

    mediapipe is imported lazily inside the function so the heavy
    dependency is only loaded when detection is actually requested.
    Streamlit's resource cache ensures a single detector instance.
    """
    import mediapipe as mp
    mp_hands = mp.solutions.hands
    return mp_hands.Hands(
        static_image_mode=True,  # each captured photo is an independent image
        max_num_hands=1,  # only one hand is classified per frame
        min_detection_confidence=0.7
    )
# ---------------- FINGER LOGIC ----------------
def get_finger_states(hand_landmarks):
    """Return five 0/1 flags (thumb, index, middle, ring, pinky); 1 = extended.

    The thumb is judged on the x-axis (tip to the right of the IP joint);
    the other four fingers on the y-axis (tip above the base joint —
    smaller y is higher in image coordinates).
    """
    lm = hand_landmarks.landmark
    tip_ids = [4, 8, 12, 16, 20]
    base_ids = [2, 6, 10, 14, 18]

    # Thumb: horizontal comparison between tip (4) and IP joint (3).
    flags = [1 if lm[4].x > lm[3].x else 0]

    # Remaining fingers: vertical comparison tip vs. base joint.
    flags.extend(
        int(lm[tip].y < lm[base].y)
        for tip, base in zip(tip_ids[1:], base_ids[1:])
    )
    return flags
# ---------------- MATCH GESTURE ----------------
def match_gesture(states, rules):
    """Return the name of the first rule whose pattern equals *states*.

    Falls back to "Unknown Gesture" when no rule matches. Rules are
    checked in dict insertion order, same as the remote JSON ordering.
    """
    hits = (name for name, info in rules.items() if info["pattern"] == states)
    return next(hits, "Unknown Gesture")
# ---------------- RECOGNITION ----------------
def recognize_gesture(frame, hands, rules):
    """Detect a hand in a BGR *frame* and return the matched gesture name.

    Returns "No Hand Detected" when MediaPipe finds no hand, otherwise
    the result of match_gesture for the first detected hand.
    """
    # MediaPipe expects RGB input while OpenCV decodes images as BGR.
    result = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    detected = result.multi_hand_landmarks
    if not detected:
        return "No Hand Detected"

    # The detector is configured for a single hand, so classify the first.
    states = get_finger_states(detected[0])
    return match_gesture(states, rules)
# ---------------- TEXT TO SPEECH ----------------
def speak_text(text):
    """Synthesize *text* to English speech and return the path of an MP3 file.

    The file is created with delete=False, so it persists after this call;
    the caller owns its lifetime.
    """
    tts = gTTS(text=text, lang="en")
    # Close the temp-file handle before gTTS writes to the path: the original
    # leaked the open file object, and on Windows writing to a still-open
    # NamedTemporaryFile fails outright.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        path = tmp.name
    tts.save(path)
    return path
# ---------------- UI ----------------
# Two-column layout: camera capture on the left, recognition output right.
col1, col2 = st.columns(2)

with col1:
    # NOTE(review): "π·" looks like a mojibake-garbled emoji — confirm the
    # intended glyph before changing it; left byte-identical here.
    st.subheader("π· Camera")
    image = st.camera_input("Capture hand gesture")

with col2:
    st.subheader("π Output")
    if image:
        gestures = load_gestures()
        hands = load_mediapipe()
        # Decode the captured JPEG bytes into an OpenCV BGR frame.
        img_bytes = image.getvalue()
        img_array = np.frombuffer(img_bytes, np.uint8)
        frame = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        if frame is None:
            # cv2.imdecode returns None (instead of raising) on bytes it
            # cannot decode; without this guard the app would crash later.
            st.error("Could not decode the captured image")
        else:
            gesture = recognize_gesture(frame, hands, gestures)
            st.success(f"π {gesture}")
            # Only speak real gestures — skip the two sentinel results.
            if gesture not in ["Unknown Gesture", "No Hand Detected"]:
                audio = speak_text(gesture)
                st.audio(audio)
    else:
        st.info("Capture an image to start")
|