# app.py — SpellTalk (polished UI for HF Spaces)
import os

os.environ["YOLO_CONFIG_DIR"] = "/tmp/Ultralytics"  # writable dir on Spaces

import string
import time
from collections import Counter, deque

import cv2
import numpy as np
import gradio as gr
from huggingface_hub import hf_hub_download
from ultralytics import YOLO

# ---- FastRTC WebRTC component
from fastrtc import WebRTC

# ================== Model ==================
weights_path = hf_hub_download("atalaydenknalbant/asl-yolo-models", filename="yolo11n.pt")
model = YOLO(weights_path)
id2label = model.names  # {class_id: label string}

# Only single ASCII-letter class names count as fingerspelling letters.
VALID = {
    lbl.upper()
    for lbl in id2label.values()
    if len(lbl) == 1 and lbl in string.ascii_letters
}

# ================== App State (globals) ==================
# NOTE(review): this state is process-global, so on a multi-user Space every
# viewer shares one word/sentence buffer and one settings set. Fine for a
# single-user demo; per-session state would need gr.State or a session map.
CONF_THRES = 0.35
HIST_LEN = 9              # history window (frames)
COMMIT_STREAK = 5         # frames of agreement to lock a letter
WORD_BREAK_SECONDS = 1.0  # pause with no detections that ends a word

history = deque(maxlen=HIST_LEN)
current_word, sentence = [], []
last_commit_time = time.time()

HELP_CHART_URL = "https://commons.wikimedia.org/wiki/Special:FilePath/Asl_alphabet_gallaudet_ann.png"


# ================== Helpers ==================
def most_common(q):
    """Return (value, count) for the most frequent non-None item in *q*.

    Returns (None, 0) when the window contains no detections at all.
    """
    items = [x for x in q if x is not None]
    if not items:
        return None, 0
    value, count = Counter(items).most_common(1)[0]
    return value, count


def reset_word():
    """Clear the letter buffer and the stability window."""
    global current_word
    current_word = []
    history.clear()


def reset_sentence():
    """Clear everything: sentence, current word, and stability window."""
    global sentence, current_word
    sentence = []
    current_word = []
    history.clear()


# ================== Video Frame Handler ==================
def _detect_letter(frame):
    """Run YOLO on *frame*; return (letter, confidence) for the best box.

    Returns (None, 0.0) when there is no box or the top class is not a
    single-letter label. Model errors propagate to the caller.
    """
    res = model.predict(frame, verbose=False, conf=CONF_THRES)
    if not res or len(res[0].boxes) == 0:
        return None, 0.0
    scores = res[0].boxes.conf.cpu().numpy()
    k = int(scores.argmax())
    cand = id2label.get(int(res[0].boxes.cls[k].item()), "").upper()
    if cand in VALID:
        return cand, float(scores[k])
    return None, 0.0


def _draw_overlay(frame, top, count):
    """Draw the translucent header bar (letter, stability, word, sentence)."""
    h, w = frame.shape[:2]
    overlay = frame.copy()
    cv2.rectangle(overlay, (0, 0), (w, 120), (0, 0, 0), -1)  # header bar
    frame = cv2.addWeighted(overlay, 0.55, frame, 0.45, 0)

    # OpenCV's Hershey fonts render ASCII only, so the "no letter"
    # placeholder must be '-' — an em dash would draw as garbage glyphs.
    label = top if top else "-"
    cv2.putText(frame, f"Letter: {label} (stability {count}/{COMMIT_STREAK})",
                (16, 64), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)

    # Stability progress bar under the letter line.
    pct = int((w - 32) * (min(1.0, count / COMMIT_STREAK) if top else 0.0))
    cv2.rectangle(frame, (16, 74), (16 + pct, 84), (255, 255, 255), -1)

    word_str = "".join(current_word)
    sent_str = " ".join(sentence[-12:]) if sentence else ""
    cv2.putText(frame, f"Word: {word_str} | Sentence: {sent_str}",
                (16, 104), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 1)
    return frame


def process_frame(frame: np.ndarray):
    """FastRTC handler: receives RGB (H, W, 3); returns annotated RGB.

    Keeps running state in globals so words persist across frames.
    """
    global current_word, last_commit_time

    if frame is None:
        return None

    # mirror like a selfie camera
    frame = cv2.flip(frame, 1)

    try:
        # _conf is currently informational only; kept for future display.
        letter, _conf = _detect_letter(frame)
    except Exception as e:
        # Surface model failures on the video itself instead of crashing
        # the stream handler.
        cv2.putText(frame, f"Model error: {e}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
        return frame

    history.append(letter)  # None entries dilute stability on purpose
    top, count = most_common(history)
    now = time.time()

    # Lock the letter once it has been stable long enough.
    if top and count >= COMMIT_STREAK:
        current_word.append(top)
        history.clear()
        last_commit_time = now

    # A long-enough pause with nothing detected ends the current word.
    if top is None and (now - last_commit_time > WORD_BREAK_SECONDS) and current_word:
        sentence.append("".join(current_word))
        current_word = []
        last_commit_time = now

    return _draw_overlay(frame, top, count)


# ================== Gradio UI ==================
THEME = gr.themes.Soft(primary_hue="blue", neutral_hue="slate")

CSS = """
/* container width and centering */
#app {max-width: 980px; margin: 0 auto;}
/* sticky title */
.sticky-head {position: sticky; top: 0; z-index: 5; backdrop-filter: blur(6px);}
.sticky-head > * {margin: 0 !important;}
/* webcam panel */
.webrtc-wrap {display:flex; justify-content:center;}
.webrtc-wrap .wrap {width: 100%;}
/* shrink the help chart */
.help-chart img {max-height: 360px; width: auto;}
/* tighter accordions */
.gr-accordion .label {font-weight: 600;}
/* mobile niceties */
@media (max-width: 768px){
  .help-chart img {max-height: 240px;}
}
"""

with gr.Blocks(title="SpellTalk — ASL fingerspelling (live)", theme=THEME,
               css=CSS, elem_id="app") as demo:
    with gr.Column(elem_classes="sticky-head"):
        gr.Markdown("## 🖐️ SpellTalk – ASL fingerspelling → words (live stream)")
        gr.Markdown(
            "Click **Start camera** below. Hold one hand in frame, shoulder-high, palm per the chart. "
            "When a letter is stable, it locks into the **Word**. Pause briefly to end a word."
        )

    with gr.Row():
        with gr.Column(scale=3):
            with gr.Group(elem_classes="webrtc-wrap"):
                webrtc = WebRTC(label="Webcam (live)", mode="send-receive", modality="video")
            with gr.Row():
                clear_word = gr.Button("Clear word")
                clear_sent = gr.Button("Clear sentence")
            with gr.Accordion("Settings", open=False):
                th = gr.Slider(0.05, 0.9, value=CONF_THRES, step=0.05,
                               label="Confidence threshold")
                hist = gr.Slider(3, 21, value=HIST_LEN, step=1,
                                 label="History length (frames)")
                streak = gr.Slider(2, 12, value=COMMIT_STREAK, step=1,
                                   label="Stability to lock (frames)")
                pause = gr.Slider(0.3, 2.5, value=WORD_BREAK_SECONDS, step=0.1,
                                  label="Pause → new word (seconds)")
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("**ASL alphabet (reference)**")
                gr.Image(value=HELP_CHART_URL, show_label=False,
                         elem_classes="help-chart", interactive=False)
                gr.Markdown(
                    "Tips:\n"
                    "- Good, even lighting; plain background.\n"
                    "- Keep hand within the frame; avoid fast motion.\n"
                    "- Letters like **J**/**Z** need motion; this model is tuned mostly for A–Y."
                )

    # Wire up runtime updates for global settings.
    def _set_conf(v):
        global CONF_THRES
        CONF_THRES = float(v)

    def _set_hist(v):
        # Rebinding with a new maxlen keeps the most recent entries.
        global HIST_LEN, history
        HIST_LEN = int(v)
        history = deque(history, maxlen=HIST_LEN)

    def _set_streak(v):
        global COMMIT_STREAK
        COMMIT_STREAK = int(v)

    def _set_pause(v):
        global WORD_BREAK_SECONDS
        WORD_BREAK_SECONDS = float(v)

    th.change(_set_conf, th, None)
    hist.change(_set_hist, hist, None)
    streak.change(_set_streak, streak, None)
    pause.change(_set_pause, pause, None)
    clear_word.click(reset_word, None, None)
    clear_sent.click(reset_sentence, None, None)

    # Stream hookup (10 min per session keeps Spaces happy).
    webrtc.stream(fn=process_frame, inputs=[webrtc], outputs=[webrtc], time_limit=600)

if __name__ == "__main__":
    demo.queue()
    demo.launch()