File size: 7,198 Bytes
224fbc7
 
fe83083
224fbc7
d861ba1
1433ac1
672fcbb
 
 
 
 
 
 
224fbc7
2597abb
4e68a78
224fbc7
672fcbb
 
 
fe83083
672fcbb
224fbc7
672fcbb
224fbc7
 
672fcbb
 
 
 
 
 
224fbc7
 
 
672fcbb
 
fe83083
 
672fcbb
 
 
224fbc7
 
 
 
 
 
 
 
 
 
 
 
4e68a78
672fcbb
224fbc7
 
672fcbb
 
1433ac1
672fcbb
4e68a78
672fcbb
224fbc7
b0095c4
 
224fbc7
d861ba1
 
 
224fbc7
 
4e68a78
fe83083
224fbc7
672fcbb
 
 
 
2597abb
672fcbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224fbc7
672fcbb
 
224fbc7
672fcbb
 
 
 
2597abb
4e68a78
 
2597abb
672fcbb
 
4e68a78
 
2597abb
672fcbb
4e68a78
672fcbb
224fbc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1433ac1
672fcbb
224fbc7
8149339
224fbc7
 
 
 
 
 
 
 
 
 
 
 
8149339
224fbc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e68a78
672fcbb
4e68a78
d6ad40f
4e68a78
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# app.py β€” SpellTalk (polished UI for HF Spaces)

import os
os.environ["YOLO_CONFIG_DIR"] = "/tmp/Ultralytics"   # writable dir on Spaces

import time, string
from collections import deque, Counter
import cv2
import numpy as np
import gradio as gr
from ultralytics import YOLO
from huggingface_hub import hf_hub_download

# ---- FastRTC WebRTC component
from fastrtc import WebRTC

# ================== Model ==================
# Download YOLO11n ASL-letter weights from the Hugging Face Hub (cached locally
# after the first call) and load them into an Ultralytics model.
weights_path = hf_hub_download("atalaydenknalbant/asl-yolo-models", filename="yolo11n.pt")
model = YOLO(weights_path)
id2label = model.names  # class index -> label string, as stored in the checkpoint
# Accept only single ASCII-letter class names; anything else is ignored at inference.
VALID = {lbl.upper() for lbl in id2label.values() if len(lbl) == 1 and lbl in string.ascii_letters}

# ================== App State (globals) ==================
# NOTE(review): this module-level state is shared by every connected client of
# the Space — fine for a single-user demo, not safe under concurrent sessions.
CONF_THRES = 0.35            # minimum detection confidence for a box to count
HIST_LEN = 9                 # history window (frames)
COMMIT_STREAK = 5            # frames of agreement to lock a letter
WORD_BREAK_SECONDS = 1.0     # no-detection pause that terminates the current word

history = deque(maxlen=HIST_LEN)     # most recent per-frame letters (None = nothing valid seen)
current_word, sentence = [], []      # letters locked so far; list of completed words
last_commit_time = time.time()       # wall-clock time of the last letter/word commit

# ASL alphabet reference chart displayed in the UI side panel
HELP_CHART_URL = "https://commons.wikimedia.org/wiki/Special:FilePath/Asl_alphabet_gallaudet_ann.png"

# ================== Helpers ==================
def most_common(q):
    """Return (value, count) for the modal non-None entry of *q*.

    *q* is any iterable of letters / None markers (the rolling detection
    history). When the iterable holds no real detections, returns (None, 0).
    Ties resolve the same way Counter.most_common does (insertion order).
    """
    detections = [v for v in q if v is not None]
    if not detections:
        return None, 0
    value, votes = Counter(detections).most_common(1)[0]
    return value, votes

def reset_word():
    """Discard the in-progress word and the detection history window."""
    global current_word, history
    history.clear()
    current_word = []

def reset_sentence():
    """Wipe all transcription state: sentence, current word, and history."""
    global sentence, current_word, history
    history.clear()
    current_word = []
    sentence = []

# ================== Video Frame Handler ==================
def process_frame(frame: np.ndarray):
    """
    FastRTC handler: receives RGB (H, W, 3); returns annotated RGB.
    Keeps running state in globals so words persist across frames.

    Pipeline per frame: mirror -> YOLO detect -> take best single-letter box
    -> debounce via rolling history -> commit letters/words -> draw overlay.
    """
    global history, current_word, sentence, last_commit_time

    if frame is None:
        return None

    # mirror like a selfie camera
    frame = cv2.flip(frame, 1)

    # run detection
    # NOTE(review): Ultralytics treats raw ndarray input as BGR while FastRTC
    # delivers RGB, so channels reach the model swapped — confirm the model
    # tolerates this (hand shapes are largely color-agnostic, but verify).
    try:
        res = model.predict(frame, verbose=False, conf=CONF_THRES)
    except Exception as e:
        # Surface the failure on the video feed instead of killing the stream.
        cv2.putText(frame, f"Model error: {e}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
        return frame

    # Keep only the single highest-confidence box, and only if its class name
    # is a one-letter label in VALID; otherwise this frame records None.
    letter, conf = None, 0.0
    if res and len(res[0].boxes) > 0:
        scores = res[0].boxes.conf.cpu().numpy()
        k = scores.argmax()
        idx = int(res[0].boxes.cls[k].item())
        conf = float(scores[k])
        cand = id2label.get(idx, "").upper()
        if cand in VALID:
            letter = cand

    # Debounce: a letter locks only after COMMIT_STREAK agreeing frames
    # inside the rolling history window.
    history.append(letter)
    top, count = most_common(history)

    now = time.time()
    if top and count >= COMMIT_STREAK:
        current_word.append(top)
        history.clear()        # restart the streak for the next letter
        last_commit_time = now

    # A sustained gap with no stable letter ends the current word.
    if (top is None) and (now - last_commit_time > WORD_BREAK_SECONDS) and current_word:
        sentence.append("".join(current_word))
        current_word = []
        last_commit_time = now

    # ---------- overlay ----------
    h, w = frame.shape[:2]
    overlay = frame.copy()
    cv2.rectangle(overlay, (0, 0), (w, 120), (0, 0, 0), -1)       # header bar
    frame = cv2.addWeighted(overlay, 0.55, frame, 0.45, 0)        # translucent blend

    label = top if top else "β€”"
    cv2.putText(frame, f"Letter: {label}  (stability {count}/{COMMIT_STREAK})",
                (16, 64), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)

    # Progress bar: how close the current candidate letter is to locking.
    pct = int((w - 32) * (min(1.0, count/COMMIT_STREAK) if top else 0.0))
    cv2.rectangle(frame, (16, 74), (16 + pct, 84), (255, 255, 255), -1)

    word_str = "".join(current_word)
    sent_str = " ".join(sentence[-12:]) if sentence else ""       # show last 12 words only
    cv2.putText(frame, f"Word: {word_str} | Sentence: {sent_str}",
                (16, 104), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 1)

    return frame

# ================== Gradio UI ==================
# Visual theme for the Blocks app.
THEME = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="slate"
)

# Custom CSS: container width, sticky header, webcam centering, help-chart
# sizing, and a mobile breakpoint for the chart.
CSS = """
/* container width and centering */
#app {max-width: 980px; margin: 0 auto;}
/* sticky title */
.sticky-head {position: sticky; top: 0; z-index: 5; backdrop-filter: blur(6px);}
.sticky-head > * {margin: 0 !important;}
/* webcam panel */
.webrtc-wrap {display:flex; justify-content:center;}
.webrtc-wrap .wrap {width: 100%;}
/* shrink the help chart */
.help-chart img {max-height: 360px; width: auto;}
/* tighter accordions */
.gr-accordion .label {font-weight: 600;}
/* mobile niceties */
@media (max-width: 768px){
  .help-chart img {max-height: 240px;}
}
"""

# Build the Gradio UI: sticky header, webcam + controls column, reference
# chart column, and runtime wiring for sliders/buttons/stream.
with gr.Blocks(title="SpellTalk β€” ASL fingerspelling (live)", theme=THEME, css=CSS, elem_id="app") as demo:
    with gr.Column(elem_classes="sticky-head"):
        gr.Markdown("## πŸ–οΈ SpellTalk – ASL fingerspelling β†’ words (live stream)")
        gr.Markdown(
            "Click **Start camera** below. Hold one hand in frame, shoulder-high, palm per the chart. "
            "When a letter is stable, it locks into the **Word**. Pause briefly to end a word."
        )

    with gr.Row():
        # Left column: live webcam feed, clear buttons, tuning sliders.
        with gr.Column(scale=3):
            with gr.Group(elem_classes="webrtc-wrap"):
                webrtc = WebRTC(label="Webcam (live)", mode="send-receive", modality="video")
            with gr.Row():
                clear_word = gr.Button("Clear word")
                clear_sent = gr.Button("Clear sentence")

            with gr.Accordion("Settings", open=False):
                th = gr.Slider(0.05, 0.9, value=CONF_THRES, step=0.05, label="Confidence threshold")
                hist = gr.Slider(3, 21, value=HIST_LEN, step=1, label="History length (frames)")
                streak = gr.Slider(2, 12, value=COMMIT_STREAK, step=1, label="Stability to lock (frames)")
                pause = gr.Slider(0.3, 2.5, value=WORD_BREAK_SECONDS, step=0.1, label="Pause β†’ new word (seconds)")

        # Right column: static ASL alphabet reference and usage tips.
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("**ASL alphabet (reference)**")
                gr.Image(value=HELP_CHART_URL, show_label=False, elem_classes="help-chart", interactive=False)
            gr.Markdown(
                "Tips:\n"
                "- Good, even lighting; plain background.\n"
                "- Keep hand within the frame; avoid fast motion.\n"
                "- Letters like **J**/**Z** need motion; this model is tuned mostly for A–Y."
            )

    # Wire up runtime updates for global settings
    # NOTE(review): these setters mutate module-level globals, so a slider
    # change affects every connected session, not just the one that moved it.
    def _set_conf(v):
        global CONF_THRES; CONF_THRES = float(v)
    def _set_hist(v):
        # Rebuild the deque so the new maxlen takes effect (keeps newest items).
        global HIST_LEN, history; HIST_LEN = int(v); history = deque(history, maxlen=HIST_LEN)
    def _set_streak(v):
        global COMMIT_STREAK; COMMIT_STREAK = int(v)
    def _set_pause(v):
        global WORD_BREAK_SECONDS; WORD_BREAK_SECONDS = float(v)

    th.change(_set_conf, th, None)
    hist.change(_set_hist, hist, None)
    streak.change(_set_streak, streak, None)
    pause.change(_set_pause, pause, None)

    clear_word.click(lambda: reset_word(), None, None)
    clear_sent.click(lambda: reset_sentence(), None, None)

    # Stream hookup (10 min per session keeps Spaces happy)
    webrtc.stream(fn=process_frame, inputs=[webrtc], outputs=[webrtc], time_limit=600)

if __name__ == "__main__":
    demo.queue()   # enable Gradio's request queue before serving
    demo.launch()