Hand2Voice / app.py
arshtech's picture
Update app.py
d5df74d verified
raw
history blame
3.76 kB
# Standard library
import os
import tempfile
import time

# Third-party
import cv2
import mediapipe as mp
import numpy as np
import requests
import streamlit as st
from gtts import gTTS
# ---------------- CONFIG ----------------
st.set_page_config(page_title="Hand2Voice", layout="wide")
# NOTE(review): "YOUR_USERNAME" is a placeholder — point this at the real repo.
GESTURE_URL = "https://raw.githubusercontent.com/YOUR_USERNAME/Hand2Voice/main/gestures/gesture_rules.json"
# ---------------- SESSION STATE ----------------
# Seed the per-session keys exactly once so later reads never miss.
for _key in ("last_gesture", "last_spoken"):
    if _key not in st.session_state:
        st.session_state[_key] = ""
# ---------------- LOAD GESTURES ----------------
@st.cache_data
def load_gestures():
    """Fetch the gesture-rule JSON from GitHub and return its "gestures" mapping.

    Cached by Streamlit, so the network round-trip happens once per session.

    Returns:
        dict: the value under the top-level "gestures" key (presumably
        gesture-name -> rule info with a "pattern" list — confirm against
        the repo's gesture_rules.json).

    Raises:
        requests.HTTPError: on a non-2xx response, instead of silently
            trying to parse an HTML error page as JSON.
        requests.Timeout: if GitHub does not answer within 10 seconds,
            instead of hanging the app forever.
    """
    resp = requests.get(GESTURE_URL, timeout=10)
    resp.raise_for_status()
    return resp.json()["gestures"]
gesture_rules = load_gestures()
# ---------------- MEDIAPIPE ----------------
# Single-hand detector. static_image_mode=True because each camera_input
# capture is an independent still image, not a tracked video stream.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=True,
    max_num_hands=1,
    min_detection_confidence=0.7
)
# ---------------- FINGER LOGIC ----------------
def get_finger_states(hand_landmarks):
    """Return [thumb, index, middle, ring, pinky] as 1 (extended) / 0 (curled).

    Thumb: tip (landmark 4) lying to the right of its IP joint (landmark 3)
    in image x counts as extended. NOTE(review): this x-comparison is
    handedness/mirroring dependent — confirm with the camera orientation used.
    Other fingers: tip landmark above its PIP joint (smaller image y).
    """
    lm = hand_landmarks.landmark
    states = [1 if lm[4].x > lm[3].x else 0]
    for tip, pip in ((8, 6), (12, 10), (16, 14), (20, 18)):
        states.append(1 if lm[tip].y < lm[pip].y else 0)
    return states
# ---------------- MATCH ----------------
def match_gesture(states):
    """Return the name of the gesture whose stored pattern equals *states*.

    Falls back to "Unknown" when no rule matches.
    """
    hits = (gesture for gesture, spec in gesture_rules.items()
            if spec["pattern"] == states)
    return next(hits, "Unknown")
# ---------------- RECOGNIZER ----------------
def recognize(frame):
    """Classify the hand gesture in a BGR frame; "No Hand" if none detected."""
    # MediaPipe expects RGB; OpenCV decodes images as BGR.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    detection = hands.process(rgb_frame)
    hand_list = detection.multi_hand_landmarks
    if not hand_list:
        return "No Hand"
    # max_num_hands=1, so the first entry is the only hand.
    return match_gesture(get_finger_states(hand_list[0]))
# ---------------- TEXT TO SPEECH ----------------
def speak(text):
    """Synthesize *text* to speech via gTTS and return the MP3 file's path.

    Args:
        text: the phrase to voice (English).

    Returns:
        str: filesystem path of the generated .mp3.

    Uses tempfile.mkstemp and closes the descriptor immediately: the original
    NamedTemporaryFile handle was never closed (one leaked fd per call), and
    an open handle also blocks gTTS from writing the file on Windows.
    The caller is responsible for eventual cleanup of the temp file.
    """
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # gTTS writes by path; we don't need the descriptor
    gTTS(text=text, lang="en").save(path)
    return path
# ---------------- UI ----------------
# Two-column layout: camera capture on the left, recognized gesture +
# spoken audio on the right.
st.markdown(
    "<h1 style='text-align:center;'>🀟 Hand2Voice</h1>"
    "<p style='text-align:center;'>Real-Time Hand Gesture to Voice</p>",
    unsafe_allow_html=True
)
st.divider()
col1, col2 = st.columns(2)
# -------- CAMERA --------
# camera_input yields one still snapshot per capture (not a video stream);
# the sleep + rerun at the bottom fakes a "live" loop around it.
with col1:
    st.subheader("πŸ“· Live Camera")
    image = st.camera_input("Live gesture feed")
# -------- OUTPUT --------
with col2:
    st.subheader("πŸ“ Live Output")
    if image:
        # Decode the captured image bytes into a BGR OpenCV frame.
        img_bytes = image.getvalue()
        np_img = np.frombuffer(img_bytes, np.uint8)
        frame = cv2.imdecode(np_img, cv2.IMREAD_COLOR)
        gesture = recognize(frame)
        st.session_state.last_gesture = gesture
        st.markdown(
            f"""
            <div style="
                padding:20px;
                background-color:#dcfce7;
                border-radius:12px;
                font-size:24px;
                font-weight:bold;
                text-align:center;">
                πŸ”Š {gesture}
            </div>
            """,
            unsafe_allow_html=True
        )
        # Speak only if gesture changes — last_spoken persists across reruns
        # so the same sign is not re-voiced every cycle.
        if gesture not in ["Unknown", "No Hand"] and gesture != st.session_state.last_spoken:
            audio = speak(gesture)
            st.audio(audio)
            st.session_state.last_spoken = gesture
        # Auto refresh (real-time effect)
        # NOTE(review): st.experimental_rerun() was removed in recent
        # Streamlit releases in favor of st.rerun() — confirm the pinned
        # Streamlit version supports it.
        time.sleep(0.5)
        st.experimental_rerun()
    else:
        st.info("Enable camera and show hand gestures")