# sign_language / app.py
# swadha's picture — Update app.py
# d6ad40f verified
# app.py — SpellTalk (polished UI for HF Spaces)
import os
os.environ["YOLO_CONFIG_DIR"] = "/tmp/Ultralytics" # writable dir on Spaces; must be set BEFORE ultralytics is imported
import time, string
from collections import deque, Counter
import cv2
import numpy as np
import gradio as gr
from ultralytics import YOLO
from huggingface_hub import hf_hub_download
# ---- FastRTC WebRTC component (streams webcam frames in/out of Gradio)
from fastrtc import WebRTC
# ================== Model ==================
# Download the ASL letter-detector weights from the Hub (cached between restarts).
weights_path = hf_hub_download("atalaydenknalbant/asl-yolo-models", filename="yolo11n.pt")
model = YOLO(weights_path)
id2label = model.names  # class index -> label string
# Keep only classes whose label is a single ASCII letter (A-Z fingerspelling).
VALID = {lbl.upper() for lbl in id2label.values() if len(lbl) == 1 and lbl in string.ascii_letters}
# ================== App State (globals) ==================
# NOTE(review): these globals are shared by ALL connected clients of the Space;
# concurrent users would interleave into the same word/sentence — confirm intent.
CONF_THRES = 0.35  # YOLO confidence threshold (adjustable from the Settings UI)
HIST_LEN = 9 # history window (frames)
COMMIT_STREAK = 5 # frames of agreement to lock a letter
WORD_BREAK_SECONDS = 1.0  # gap with no letter that ends the current word
history = deque(maxlen=HIST_LEN)  # recent per-frame letter votes (None = no detection)
current_word, sentence = [], []  # letters locked so far / completed words
last_commit_time = time.time()  # timestamp of the last letter commit or word break
HELP_CHART_URL = "https://commons.wikimedia.org/wiki/Special:FilePath/Asl_alphabet_gallaudet_ann.png"
# ================== Helpers ==================
def most_common(q):
    """Return ``(value, count)`` for the most frequent non-None entry of *q*.

    ``None`` entries (frames with no detection) are ignored; if nothing
    remains, ``(None, 0)`` is returned.
    """
    votes = [entry for entry in q if entry is not None]
    if not votes:
        return None, 0
    winner, tally = Counter(votes).most_common(1)[0]
    return winner, tally
def reset_word():
    """Drop the in-progress word and the letter-stability history."""
    global current_word
    current_word = []
    history.clear()
def reset_sentence():
    """Wipe the sentence, the in-progress word, and the stability history."""
    global sentence, current_word
    sentence = []
    current_word = []
    history.clear()
# ================== Video Frame Handler ==================
def process_frame(frame: np.ndarray):
    """
    FastRTC handler: receives RGB (H, W, 3); returns annotated RGB.

    Keeps running state in globals so words persist across frames:
    ``history`` collects one letter vote (or None) per frame; a letter is
    committed to ``current_word`` once it dominates the window, and a pause
    with no detections flushes the word into ``sentence``.
    """
    global history, current_word, sentence, last_commit_time
    if frame is None:
        return None
    # mirror like a selfie camera
    frame = cv2.flip(frame, 1)
    # run detection
    # NOTE(review): Ultralytics treats raw ndarrays as BGR while this frame is
    # RGB per the docstring — confirm whether the channel swap hurts accuracy.
    try:
        res = model.predict(frame, verbose=False, conf=CONF_THRES)
    except Exception as e:
        # Best-effort: paint the error onto the frame instead of killing the stream.
        cv2.putText(frame, f"Model error: {e}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
        return frame
    # Take the single highest-confidence box; accept it only if its class
    # maps to a valid one-letter label.
    letter, conf = None, 0.0
    if res and len(res[0].boxes) > 0:
        scores = res[0].boxes.conf.cpu().numpy()
        k = scores.argmax()
        idx = int(res[0].boxes.cls[k].item())
        conf = float(scores[k])  # kept for debugging; not shown in the overlay
        cand = id2label.get(idx, "").upper()
        if cand in VALID:
            letter = cand
    # Vote over the last HIST_LEN frames (most_common ignores the Nones).
    history.append(letter)
    top, count = most_common(history)
    now = time.time()
    # Commit: one letter won at least COMMIT_STREAK votes -> lock it in.
    if top and count >= COMMIT_STREAK:
        current_word.append(top)
        history.clear()
        last_commit_time = now
    # Word break: nothing detected for WORD_BREAK_SECONDS -> finish the word.
    if (top is None) and (now - last_commit_time > WORD_BREAK_SECONDS) and current_word:
        sentence.append("".join(current_word))
        current_word = []
        last_commit_time = now
    # ---------- overlay ----------
    h, w = frame.shape[:2]
    overlay = frame.copy()
    cv2.rectangle(overlay, (0, 0), (w, 120), (0, 0, 0), -1) # header bar
    frame = cv2.addWeighted(overlay, 0.55, frame, 0.45, 0)  # translucent header
    label = top if top else "β€”"
    # NOTE: on the frame where a commit just happened, `count` still shows the
    # pre-commit tally (history was cleared after top/count were computed).
    cv2.putText(frame, f"Letter: {label} (stability {count}/{COMMIT_STREAK})",
                (16, 64), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)
    # Progress bar toward the commit streak, scaled to the frame width.
    pct = int((w - 32) * (min(1.0, count/COMMIT_STREAK) if top else 0.0))
    cv2.rectangle(frame, (16, 74), (16 + pct, 84), (255, 255, 255), -1)
    word_str = "".join(current_word)
    sent_str = " ".join(sentence[-12:]) if sentence else ""  # show only recent words
    cv2.putText(frame, f"Word: {word_str} | Sentence: {sent_str}",
                (16, 104), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 1)
    return frame
# ================== Gradio UI ==================
# Soft theme with a blue primary palette and slate neutrals.
THEME = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="slate"
)
# Custom CSS injected into the Blocks page; selectors target Gradio's
# generated markup plus the elem_classes set in the layout below.
CSS = """
/* container width and centering */
#app {max-width: 980px; margin: 0 auto;}
/* sticky title */
.sticky-head {position: sticky; top: 0; z-index: 5; backdrop-filter: blur(6px);}
.sticky-head > * {margin: 0 !important;}
/* webcam panel */
.webrtc-wrap {display:flex; justify-content:center;}
.webrtc-wrap .wrap {width: 100%;}
/* shrink the help chart */
.help-chart img {max-height: 360px; width: auto;}
/* tighter accordions */
.gr-accordion .label {font-weight: 600;}
/* mobile niceties */
@media (max-width: 768px){
.help-chart img {max-height: 240px;}
}
"""
with gr.Blocks(title="SpellTalk β€” ASL fingerspelling (live)", theme=THEME, css=CSS, elem_id="app") as demo:
    # Sticky header: title plus quick usage instructions stay visible on scroll.
    with gr.Column(elem_classes="sticky-head"):
        gr.Markdown("## πŸ–οΈ SpellTalk – ASL fingerspelling β†’ words (live stream)")
        gr.Markdown(
            "Click **Start camera** below. Hold one hand in frame, shoulder-high, palm per the chart. "
            "When a letter is stable, it locks into the **Word**. Pause briefly to end a word."
        )
    with gr.Row():
        # Left column: live webcam stream, reset buttons, and tuning sliders.
        with gr.Column(scale=3):
            with gr.Group(elem_classes="webrtc-wrap"):
                webrtc = WebRTC(label="Webcam (live)", mode="send-receive", modality="video")
            with gr.Row():
                clear_word = gr.Button("Clear word")
                clear_sent = gr.Button("Clear sentence")
            # Sliders are seeded from the module-level tuning globals; the
            # .change handlers below write new values back into them.
            with gr.Accordion("Settings", open=False):
                th = gr.Slider(0.05, 0.9, value=CONF_THRES, step=0.05, label="Confidence threshold")
                hist = gr.Slider(3, 21, value=HIST_LEN, step=1, label="History length (frames)")
                streak = gr.Slider(2, 12, value=COMMIT_STREAK, step=1, label="Stability to lock (frames)")
                pause = gr.Slider(0.3, 2.5, value=WORD_BREAK_SECONDS, step=0.1, label="Pause β†’ new word (seconds)")
        # Right column: static ASL alphabet reference chart and signing tips.
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("**ASL alphabet (reference)**")
                gr.Image(value=HELP_CHART_URL, show_label=False, elem_classes="help-chart", interactive=False)
                gr.Markdown(
                    "Tips:\n"
                    "- Good, even lighting; plain background.\n"
                    "- Keep hand within the frame; avoid fast motion.\n"
                    "- Letters like **J**/**Z** need motion; this model is tuned mostly for A–Y."
                )
    # Wire up runtime updates for global settings
    # NOTE(review): these mutate module-level globals read by process_frame,
    # so every connected session shares the same settings — confirm intent.
    def _set_conf(v):
        global CONF_THRES; CONF_THRES = float(v)
    def _set_hist(v):
        # Rebind history with the new maxlen, keeping the most recent votes.
        global HIST_LEN, history; HIST_LEN = int(v); history = deque(history, maxlen=HIST_LEN)
    def _set_streak(v):
        global COMMIT_STREAK; COMMIT_STREAK = int(v)
    def _set_pause(v):
        global WORD_BREAK_SECONDS; WORD_BREAK_SECONDS = float(v)
    th.change(_set_conf, th, None)
    hist.change(_set_hist, hist, None)
    streak.change(_set_streak, streak, None)
    pause.change(_set_pause, pause, None)
    clear_word.click(lambda: reset_word(), None, None)
    clear_sent.click(lambda: reset_sentence(), None, None)
    # Stream hookup (10 min per session keeps Spaces happy)
    webrtc.stream(fn=process_frame, inputs=[webrtc], outputs=[webrtc], time_limit=600)
if __name__ == "__main__":
    # Blocks.queue() returns the app instance, so the two setup calls chain:
    # enable request queuing, then start the server.
    demo.queue().launch()