Spaces:
Runtime error
Runtime error
File size: 7,198 Bytes
224fbc7 fe83083 224fbc7 d861ba1 1433ac1 672fcbb 224fbc7 2597abb 4e68a78 224fbc7 672fcbb fe83083 672fcbb 224fbc7 672fcbb 224fbc7 672fcbb 224fbc7 672fcbb fe83083 672fcbb 224fbc7 4e68a78 672fcbb 224fbc7 672fcbb 1433ac1 672fcbb 4e68a78 672fcbb 224fbc7 b0095c4 224fbc7 d861ba1 224fbc7 4e68a78 fe83083 224fbc7 672fcbb 2597abb 672fcbb 224fbc7 672fcbb 224fbc7 672fcbb 2597abb 4e68a78 2597abb 672fcbb 4e68a78 2597abb 672fcbb 4e68a78 672fcbb 224fbc7 1433ac1 672fcbb 224fbc7 8149339 224fbc7 8149339 224fbc7 4e68a78 672fcbb 4e68a78 d6ad40f 4e68a78 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | # app.py β SpellTalk (polished UI for HF Spaces)
import os
os.environ["YOLO_CONFIG_DIR"] = "/tmp/Ultralytics" # writable dir on Spaces
import time, string
from collections import deque, Counter
import cv2
import numpy as np
import gradio as gr
from ultralytics import YOLO
from huggingface_hub import hf_hub_download
# ---- FastRTC WebRTC component
from fastrtc import WebRTC
# ================== Model ==================
# Download the pretrained ASL letter-detection weights from the HF Hub and
# load them into a YOLO model (runs once at import time).
weights_path = hf_hub_download("atalaydenknalbant/asl-yolo-models", filename="yolo11n.pt")
model = YOLO(weights_path)
id2label = model.names  # class index -> label string, as exported with the weights
# Keep only classes that are a single ASCII letter (drops any non-letter classes).
VALID = {lbl.upper() for lbl in id2label.values() if len(lbl) == 1 and lbl in string.ascii_letters}
# ================== App State (globals) ==================
# All of these are runtime-tunable via the Settings sliders in the UI below.
CONF_THRES = 0.35  # minimum YOLO detection confidence
HIST_LEN = 9  # history window (frames)
COMMIT_STREAK = 5  # frames of agreement to lock a letter
WORD_BREAK_SECONDS = 1.0  # no-detection gap that ends the current word
# Per-frame letter votes; entries are a letter or None (no valid detection).
history = deque(maxlen=HIST_LEN)
# current_word: letters locked so far; sentence: completed words.
current_word, sentence = [], []
# Timestamp of the last letter lock or word break (drives word segmentation).
last_commit_time = time.time()
HELP_CHART_URL = "https://commons.wikimedia.org/wiki/Special:FilePath/Asl_alphabet_gallaudet_ann.png"
# ================== Helpers ==================
def most_common(q):
    """Majority vote over the detection window.

    Ignores ``None`` entries (frames with no valid detection). Returns a
    ``(letter, count)`` pair, or ``(None, 0)`` when the window holds no
    detections at all.
    """
    votes = [v for v in q if v is not None]
    if not votes:
        return None, 0
    winner, tally = Counter(votes).most_common(1)[0]
    return winner, tally
def reset_word():
    """Drop the partially-spelled word and forget recent detection votes."""
    global current_word, history
    history.clear()
    current_word = []
def reset_sentence():
    """Wipe the entire transcript: sentence, in-progress word, and vote history."""
    global sentence, current_word, history
    history.clear()
    current_word = []
    sentence = []
# ================== Video Frame Handler ==================
def process_frame(frame: np.ndarray):
    """
    FastRTC handler: receives one RGB frame (H, W, 3); returns the annotated RGB frame.

    Pipeline per frame: mirror -> YOLO detect -> vote over a sliding history
    window -> lock a letter once it is stable -> break into a new word after a
    detection-free pause. Running state lives in module globals so words
    persist across frames of the same stream.

    NOTE(review): the frame is passed to ``model.predict`` as-is; Ultralytics
    treats raw numpy arrays as BGR, so channels may be swapped here — confirm
    against the FastRTC frame format.
    """
    global history, current_word, sentence, last_commit_time
    if frame is None:
        return None
    # mirror like a selfie camera
    frame = cv2.flip(frame, 1)
    # run detection; on failure, surface the error on the frame instead of crashing the stream
    try:
        res = model.predict(frame, verbose=False, conf=CONF_THRES)
    except Exception as e:
        cv2.putText(frame, f"Model error: {e}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
        return frame
    # Pick the single highest-confidence detection and keep it only if it
    # maps to a valid single-letter class.
    letter, conf = None, 0.0
    if res and len(res[0].boxes) > 0:
        scores = res[0].boxes.conf.cpu().numpy()
        k = scores.argmax()
        idx = int(res[0].boxes.cls[k].item())
        conf = float(scores[k])
        cand = id2label.get(idx, "").upper()
        if cand in VALID:
            letter = cand
    # Vote over the sliding window. None entries are appended on purpose so
    # stale letters age out of the deque when the hand leaves the frame.
    history.append(letter)
    top, count = most_common(history)
    now = time.time()
    # Lock the letter once it has won enough frames; clear the window so the
    # same pose must re-accumulate before it can lock again.
    if top and count >= COMMIT_STREAK:
        current_word.append(top)
        history.clear()
        last_commit_time = now
    # A detection-free pause after at least one locked letter ends the word.
    if (top is None) and (now - last_commit_time > WORD_BREAK_SECONDS) and current_word:
        sentence.append("".join(current_word))
        current_word = []
        last_commit_time = now
    # ---------- overlay ----------
    # Semi-transparent header bar, current letter + stability progress bar,
    # then the accumulated word/sentence text.
    h, w = frame.shape[:2]
    overlay = frame.copy()
    cv2.rectangle(overlay, (0, 0), (w, 120), (0, 0, 0), -1)  # header bar
    frame = cv2.addWeighted(overlay, 0.55, frame, 0.45, 0)
    label = top if top else "β"
    cv2.putText(frame, f"Letter: {label} (stability {count}/{COMMIT_STREAK})",
                (16, 64), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)
    # Progress bar width scales with vote count, capped at 100%.
    pct = int((w - 32) * (min(1.0, count / COMMIT_STREAK) if top else 0.0))
    cv2.rectangle(frame, (16, 74), (16 + pct, 84), (255, 255, 255), -1)
    word_str = "".join(current_word)
    sent_str = " ".join(sentence[-12:]) if sentence else ""  # show last 12 words only
    cv2.putText(frame, f"Word: {word_str} | Sentence: {sent_str}",
                (16, 104), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 1)
    return frame
# ================== Gradio UI ==================
# ================== Gradio UI ==================
# Soft theme with blue accents; layout and mobile sizing are handled by the
# custom CSS string below (the /* ... */ comments live inside the CSS itself).
THEME = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="slate"
)
CSS = """
/* container width and centering */
#app {max-width: 980px; margin: 0 auto;}
/* sticky title */
.sticky-head {position: sticky; top: 0; z-index: 5; backdrop-filter: blur(6px);}
.sticky-head > * {margin: 0 !important;}
/* webcam panel */
.webrtc-wrap {display:flex; justify-content:center;}
.webrtc-wrap .wrap {width: 100%;}
/* shrink the help chart */
.help-chart img {max-height: 360px; width: auto;}
/* tighter accordions */
.gr-accordion .label {font-weight: 600;}
/* mobile niceties */
@media (max-width: 768px){
.help-chart img {max-height: 240px;}
}
"""
# Page layout: sticky header on top; left column = live webcam, clear buttons
# and tuning sliders; right column = reference chart and usage tips.
with gr.Blocks(title="SpellTalk β ASL fingerspelling (live)", theme=THEME, css=CSS, elem_id="app") as demo:
    with gr.Column(elem_classes="sticky-head"):
        gr.Markdown("## ποΈ SpellTalk β ASL fingerspelling β words (live stream)")
        gr.Markdown(
            "Click **Start camera** below. Hold one hand in frame, shoulder-high, palm per the chart. "
            "When a letter is stable, it locks into the **Word**. Pause briefly to end a word."
        )
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Group(elem_classes="webrtc-wrap"):
                # send-receive: browser sends frames, server returns annotated ones.
                webrtc = WebRTC(label="Webcam (live)", mode="send-receive", modality="video")
            with gr.Row():
                clear_word = gr.Button("Clear word")
                clear_sent = gr.Button("Clear sentence")
            with gr.Accordion("Settings", open=False):
                # Sliders are initialized from the current global values.
                th = gr.Slider(0.05, 0.9, value=CONF_THRES, step=0.05, label="Confidence threshold")
                hist = gr.Slider(3, 21, value=HIST_LEN, step=1, label="History length (frames)")
                streak = gr.Slider(2, 12, value=COMMIT_STREAK, step=1, label="Stability to lock (frames)")
                pause = gr.Slider(0.3, 2.5, value=WORD_BREAK_SECONDS, step=0.1, label="Pause β new word (seconds)")
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("**ASL alphabet (reference)**")
                gr.Image(value=HELP_CHART_URL, show_label=False, elem_classes="help-chart", interactive=False)
            gr.Markdown(
                "Tips:\n"
                "- Good, even lighting; plain background.\n"
                "- Keep hand within the frame; avoid fast motion.\n"
                "- Letters like **J**/**Z** need motion; this model is tuned mostly for AβY."
            )
    # Wire up runtime updates for global settings. Each setter rebinds a
    # module-level global that process_frame reads on the next frame, so no
    # UI output is needed (outputs=None).
    def _set_conf(v):
        global CONF_THRES; CONF_THRES = float(v)
    def _set_hist(v):
        # Rebuild the deque so the new maxlen takes effect while keeping
        # the most recent votes.
        global HIST_LEN, history; HIST_LEN = int(v); history = deque(history, maxlen=HIST_LEN)
    def _set_streak(v):
        global COMMIT_STREAK; COMMIT_STREAK = int(v)
    def _set_pause(v):
        global WORD_BREAK_SECONDS; WORD_BREAK_SECONDS = float(v)
    th.change(_set_conf, th, None)
    hist.change(_set_hist, hist, None)
    streak.change(_set_streak, streak, None)
    pause.change(_set_pause, pause, None)
    clear_word.click(lambda: reset_word(), None, None)
    clear_sent.click(lambda: reset_sentence(), None, None)
    # Stream hookup (10 min per session keeps Spaces happy)
    webrtc.stream(fn=process_frame, inputs=[webrtc], outputs=[webrtc], time_limit=600)
if __name__ == "__main__":
    # Queueing is required for streaming handlers; then start the server.
    demo.queue()
    demo.launch()
|