import streamlit as st
import cv2
import numpy as np
import requests
from gtts import gTTS
import tempfile

st.set_page_config(page_title="Hand2Voice", layout="wide")
st.title("🤖 Hand2Voice")
st.write("Hand Gesture to Voice Conversion")

GESTURE_URL = "https://raw.githubusercontent.com/imarshbir/Hand2Voice/main/gestures/gesture_rules.json"
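
# The rules file is assumed (its exact schema isn't shown here) to map
# gesture names to five-element finger patterns matching the output of
# get_finger_states() below, e.g.:
#   {"gestures": {"Hello": {"pattern": [1, 1, 1, 1, 1]}}}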


@st.cache_data
def load_gestures():
    # Cached across reruns; the timeout keeps a slow fetch from hanging the app.
    return requests.get(GESTURE_URL, timeout=10).json()["gestures"]
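
# st.cache_data suits the serializable JSON rules above; the MediaPipe Hands
# object below is instead held with st.cache_resource, which keeps a single
# live (unpickled) instance alive across reruns.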


@st.cache_resource
def load_mediapipe():
    # Imported lazily inside the cached loader so the heavy dependency is
    # only pulled in once per session.
    import mediapipe as mp

    mp_hands = mp.solutions.hands
    return mp_hands.Hands(
        static_image_mode=True,  # single captured frames, not a video stream
        max_num_hands=1,
        min_detection_confidence=0.7,
    )


def get_finger_states(hand_landmarks):
    # Return five 0/1 flags (thumb..pinky); 1 means the finger is extended.
    finger_tips = [4, 8, 12, 16, 20]
    finger_bases = [2, 6, 10, 14, 18]

    states = []

    # The thumb extends sideways, so compare the x-coordinates of its tip
    # and the joint below it.
    states.append(
        1 if hand_landmarks.landmark[4].x > hand_landmarks.landmark[3].x else 0
    )

    # The other fingers extend upward: an extended tip has a smaller y than
    # its base joint (image coordinates grow downward).
    for tip, base in zip(finger_tips[1:], finger_bases[1:]):
        states.append(
            1 if hand_landmarks.landmark[tip].y < hand_landmarks.landmark[base].y else 0
        )

    return states
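
# Note: the thumb comparison above implicitly assumes one hand orientation;
# a mirrored (left) hand flips the x test. If that matters, MediaPipe also
# reports handedness, which a sketch like this could use (variable names
# hypothetical):
#
#   label = result.multi_handedness[0].classification[0].label  # "Left"/"Right"
#   thumb_extended = (tip.x > joint.x) if label == "Right" else (tip.x < joint.x)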


def match_gesture(states, rules):
    # Return the first gesture whose stored pattern matches exactly.
    for name, info in rules.items():
        if states == info["pattern"]:
            return name
    return "Unknown Gesture"


def recognize_gesture(frame, hands, rules):
    # MediaPipe expects RGB input; OpenCV decodes images as BGR.
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(rgb)

    if result.multi_hand_landmarks:
        # max_num_hands=1, so this classifies the single detected hand.
        for hand_landmarks in result.multi_hand_landmarks:
            states = get_finger_states(hand_landmarks)
            return match_gesture(states, rules)

    return "No Hand Detected"


def speak_text(text):
    tts = gTTS(text=text, lang="en")
    file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    file.close()  # close the open handle first so gTTS can write on Windows
    tts.save(file.name)
    return file.name
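
# gTTS performs a network call on every invocation; if the same gesture is
# spoken repeatedly, decorating speak_text with @st.cache_data (keyed on the
# text) would be one way to avoid redundant requests.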


col1, col2 = st.columns(2)

with col1:
    st.subheader("📷 Camera")
    image = st.camera_input("Capture hand gesture")

with col2:
    st.subheader("🔊 Output")

    if image:
        gestures = load_gestures()
        hands = load_mediapipe()

        # Decode the captured image bytes into an OpenCV BGR frame.
        img_bytes = image.getvalue()
        img_array = np.frombuffer(img_bytes, np.uint8)
        frame = cv2.imdecode(img_array, cv2.IMREAD_COLOR)

        gesture = recognize_gesture(frame, hands, gestures)

        st.success(f"🖐 {gesture}")

        if gesture not in ["Unknown Gesture", "No Hand Detected"]:
            audio = speak_text(gesture)
            st.audio(audio)
    else:
        st.info("Capture an image to start")
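
# To try this locally (script name assumed):
#   pip install streamlit opencv-python-headless numpy requests gTTS mediapipe
#   streamlit run app.py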