# sign_language / app.py
# swadha's picture — Update app.py
# d6ad40f verified
# app.py — SpellTalk (polished UI for HF Spaces)
import os
os.environ["YOLO_CONFIG_DIR"] = "/tmp/Ultralytics" # writable dir on Spaces; must be set BEFORE ultralytics is imported
import time, string
from collections import deque, Counter
import cv2
import numpy as np
import gradio as gr
from ultralytics import YOLO
from huggingface_hub import hf_hub_download
# ---- FastRTC WebRTC component (streams webcam frames in/out of Gradio)
from fastrtc import WebRTC
# ================== Model ==================
# Download the ASL letter-detector weights from the Hub (cached between restarts).
weights_path = hf_hub_download("atalaydenknalbant/asl-yolo-models", filename="yolo11n.pt")
model = YOLO(weights_path)
id2label = model.names  # class index -> label string
# Keep only classes whose label is a single ASCII letter (A-Z fingerspelling).
VALID = {lbl.upper() for lbl in id2label.values() if len(lbl) == 1 and lbl in string.ascii_letters}
# ================== App State (globals) ==================
# NOTE(review): these globals are shared by ALL connected clients of the Space;
# concurrent users would interleave into the same word/sentence — confirm intent.
CONF_THRES = 0.35  # YOLO confidence threshold (adjustable from the Settings UI)
HIST_LEN = 9 # history window (frames)
COMMIT_STREAK = 5 # frames of agreement to lock a letter
WORD_BREAK_SECONDS = 1.0  # gap with no letter that ends the current word
history = deque(maxlen=HIST_LEN)  # recent per-frame letter votes (None = no detection)
current_word, sentence = [], []  # letters locked so far / completed words
last_commit_time = time.time()  # timestamp of the last letter commit or word break
HELP_CHART_URL = "https://commons.wikimedia.org/wiki/Special:FilePath/Asl_alphabet_gallaudet_ann.png"
# ================== Helpers ==================
def most_common(q):
    """Return ``(value, count)`` for the most frequent non-None entry of *q*.

    ``None`` entries (frames with no detection) are ignored; if nothing
    remains, ``(None, 0)`` is returned.
    """
    votes = [entry for entry in q if entry is not None]
    if not votes:
        return None, 0
    winner, tally = Counter(votes).most_common(1)[0]
    return winner, tally
def reset_word():
    """Drop the in-progress word and the letter-stability history."""
    global current_word
    current_word = []
    history.clear()
def reset_sentence():
    """Wipe the sentence, the in-progress word, and the stability history."""
    global sentence, current_word
    sentence = []
    current_word = []
    history.clear()
# ================== Video Frame Handler ==================
def process_frame(frame: np.ndarray):
    """
    FastRTC handler: receives RGB (H, W, 3); returns annotated RGB.

    Keeps running state in globals so words persist across frames:
    ``history`` collects one letter vote (or None) per frame; a letter is
    committed to ``current_word`` once it dominates the window, and a pause
    with no detections flushes the word into ``sentence``.
    """
    global history, current_word, sentence, last_commit_time
    if frame is None:
        return None
    # mirror like a selfie camera
    frame = cv2.flip(frame, 1)
    # run detection
    # NOTE(review): Ultralytics treats raw ndarrays as BGR while this frame is
    # RGB per the docstring — confirm whether the channel swap hurts accuracy.
    try:
        res = model.predict(frame, verbose=False, conf=CONF_THRES)
    except Exception as e:
        # Best-effort: paint the error onto the frame instead of killing the stream.
        cv2.putText(frame, f"Model error: {e}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
        return frame
    # Take the single highest-confidence box; accept it only if its class
    # maps to a valid one-letter label.
    letter, conf = None, 0.0
    if res and len(res[0].boxes) > 0:
        scores = res[0].boxes.conf.cpu().numpy()
        k = scores.argmax()
        idx = int(res[0].boxes.cls[k].item())
        conf = float(scores[k])  # kept for debugging; not shown in the overlay
        cand = id2label.get(idx, "").upper()
        if cand in VALID:
            letter = cand
    # Vote over the last HIST_LEN frames (most_common ignores the Nones).
    history.append(letter)
    top, count = most_common(history)
    now = time.time()
    # Commit: one letter won at least COMMIT_STREAK votes -> lock it in.
    if top and count >= COMMIT_STREAK:
        current_word.append(top)
        history.clear()
        last_commit_time = now
    # Word break: nothing detected for WORD_BREAK_SECONDS -> finish the word.
    if (top is None) and (now - last_commit_time > WORD_BREAK_SECONDS) and current_word:
        sentence.append("".join(current_word))
        current_word = []
        last_commit_time = now
    # ---------- overlay ----------
    h, w = frame.shape[:2]
    overlay = frame.copy()
    cv2.rectangle(overlay, (0, 0), (w, 120), (0, 0, 0), -1) # header bar
    frame = cv2.addWeighted(overlay, 0.55, frame, 0.45, 0)  # translucent header
    label = top if top else "β€”"
    # NOTE: on the frame where a commit just happened, `count` still shows the
    # pre-commit tally (history was cleared after top/count were computed).
    cv2.putText(frame, f"Letter: {label} (stability {count}/{COMMIT_STREAK})",
                (16, 64), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)
    # Progress bar toward the commit streak, scaled to the frame width.
    pct = int((w - 32) * (min(1.0, count/COMMIT_STREAK) if top else 0.0))
    cv2.rectangle(frame, (16, 74), (16 + pct, 84), (255, 255, 255), -1)
    word_str = "".join(current_word)
    sent_str = " ".join(sentence[-12:]) if sentence else ""  # show only recent words
    cv2.putText(frame, f"Word: {word_str} | Sentence: {sent_str}",
                (16, 104), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 1)
    return frame
# ================== Gradio UI ==================
# Soft theme with a blue primary palette and slate neutrals.
THEME = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="slate"
)
# Custom CSS injected into the Blocks page; selectors target Gradio's
# generated markup plus the elem_classes set in the layout below.
CSS = """
/* container width and centering */
#app {max-width: 980px; margin: 0 auto;}
/* sticky title */
.sticky-head {position: sticky; top: 0; z-index: 5; backdrop-filter: blur(6px);}
.sticky-head > * {margin: 0 !important;}
/* webcam panel */
.webrtc-wrap {display:flex; justify-content:center;}
.webrtc-wrap .wrap {width: 100%;}
/* shrink the help chart */
.help-chart img {max-height: 360px; width: auto;}
/* tighter accordions */
.gr-accordion .label {font-weight: 600;}
/* mobile niceties */
@media (max-width: 768px){
.help-chart img {max-height: 240px;}
}
"""
with gr.Blocks(title="SpellTalk β€” ASL fingerspelling (live)", theme=THEME, css=CSS, elem_id="app") as demo:
    # Sticky header: title plus quick usage instructions stay visible on scroll.
    with gr.Column(elem_classes="sticky-head"):
        gr.Markdown("## πŸ–οΈ SpellTalk – ASL fingerspelling β†’ words (live stream)")
        gr.Markdown(
            "Click **Start camera** below. Hold one hand in frame, shoulder-high, palm per the chart. "
            "When a letter is stable, it locks into the **Word**. Pause briefly to end a word."
        )
    with gr.Row():
        # Left column: live webcam stream, reset buttons, and tuning sliders.
        with gr.Column(scale=3):
            with gr.Group(elem_classes="webrtc-wrap"):
                webrtc = WebRTC(label="Webcam (live)", mode="send-receive", modality="video")
            with gr.Row():
                clear_word = gr.Button("Clear word")
                clear_sent = gr.Button("Clear sentence")
            # Sliders are seeded from the module-level tuning globals; the
            # .change handlers below write new values back into them.
            with gr.Accordion("Settings", open=False):
                th = gr.Slider(0.05, 0.9, value=CONF_THRES, step=0.05, label="Confidence threshold")
                hist = gr.Slider(3, 21, value=HIST_LEN, step=1, label="History length (frames)")
                streak = gr.Slider(2, 12, value=COMMIT_STREAK, step=1, label="Stability to lock (frames)")
                pause = gr.Slider(0.3, 2.5, value=WORD_BREAK_SECONDS, step=0.1, label="Pause β†’ new word (seconds)")
        # Right column: static ASL alphabet reference chart and signing tips.
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("**ASL alphabet (reference)**")
                gr.Image(value=HELP_CHART_URL, show_label=False, elem_classes="help-chart", interactive=False)
                gr.Markdown(
                    "Tips:\n"
                    "- Good, even lighting; plain background.\n"
                    "- Keep hand within the frame; avoid fast motion.\n"
                    "- Letters like **J**/**Z** need motion; this model is tuned mostly for A–Y."
                )
    # Wire up runtime updates for global settings
    # NOTE(review): these mutate module-level globals read by process_frame,
    # so every connected session shares the same settings — confirm intent.
    def _set_conf(v):
        global CONF_THRES; CONF_THRES = float(v)
    def _set_hist(v):
        # Rebind history with the new maxlen, keeping the most recent votes.
        global HIST_LEN, history; HIST_LEN = int(v); history = deque(history, maxlen=HIST_LEN)
    def _set_streak(v):
        global COMMIT_STREAK; COMMIT_STREAK = int(v)
    def _set_pause(v):
        global WORD_BREAK_SECONDS; WORD_BREAK_SECONDS = float(v)
    th.change(_set_conf, th, None)
    hist.change(_set_hist, hist, None)
    streak.change(_set_streak, streak, None)
    pause.change(_set_pause, pause, None)
    clear_word.click(lambda: reset_word(), None, None)
    clear_sent.click(lambda: reset_sentence(), None, None)
    # Stream hookup (10 min per session keeps Spaces happy)
    webrtc.stream(fn=process_frame, inputs=[webrtc], outputs=[webrtc], time_limit=600)
if __name__ == "__main__":
    # Blocks.queue() returns the app instance, so the two setup calls chain:
    # enable request queuing, then start the server.
    demo.queue().launch()