File size: 7,198 Bytes
224fbc7
 
fe83083
224fbc7
d861ba1
1433ac1
672fcbb
 
 
 
 
 
 
224fbc7
2597abb
4e68a78
224fbc7
672fcbb
 
 
fe83083
672fcbb
224fbc7
672fcbb
224fbc7
 
672fcbb
 
 
 
 
 
224fbc7
 
 
672fcbb
 
fe83083
 
672fcbb
 
 
224fbc7
 
 
 
 
 
 
 
 
 
 
 
4e68a78
672fcbb
224fbc7
 
672fcbb
 
1433ac1
672fcbb
4e68a78
672fcbb
224fbc7
b0095c4
 
224fbc7
d861ba1
 
 
224fbc7
 
4e68a78
fe83083
224fbc7
672fcbb
 
 
 
2597abb
672fcbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224fbc7
672fcbb
 
224fbc7
672fcbb
 
 
 
2597abb
4e68a78
 
2597abb
672fcbb
 
4e68a78
 
2597abb
672fcbb
4e68a78
672fcbb
224fbc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1433ac1
672fcbb
224fbc7
8149339
224fbc7
 
 
 
 
 
 
 
 
 
 
 
8149339
224fbc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e68a78
672fcbb
4e68a78
d6ad40f
4e68a78
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# app.py β€” SpellTalk (polished UI for HF Spaces)

import os
os.environ["YOLO_CONFIG_DIR"] = "/tmp/Ultralytics"   # writable dir on Spaces

import time, string
from collections import deque, Counter
import cv2
import numpy as np
import gradio as gr
from ultralytics import YOLO
from huggingface_hub import hf_hub_download

# ---- FastRTC WebRTC component
from fastrtc import WebRTC

# ================== Model ==================
# Download YOLO11n ASL-letter weights from the Hugging Face Hub (cached locally
# after the first call) and load them into an Ultralytics model.
weights_path = hf_hub_download("atalaydenknalbant/asl-yolo-models", filename="yolo11n.pt")
model = YOLO(weights_path)
id2label = model.names  # class index -> label string, as stored in the checkpoint
# Accept only single ASCII-letter class names; anything else is ignored at inference.
VALID = {lbl.upper() for lbl in id2label.values() if len(lbl) == 1 and lbl in string.ascii_letters}

# ================== App State (globals) ==================
# NOTE(review): this module-level state is shared by every connected client of
# the Space — fine for a single-user demo, not safe under concurrent sessions.
CONF_THRES = 0.35            # minimum detection confidence for a box to count
HIST_LEN = 9                 # history window (frames)
COMMIT_STREAK = 5            # frames of agreement to lock a letter
WORD_BREAK_SECONDS = 1.0     # no-detection pause that terminates the current word

history = deque(maxlen=HIST_LEN)     # most recent per-frame letters (None = nothing valid seen)
current_word, sentence = [], []      # letters locked so far; list of completed words
last_commit_time = time.time()       # wall-clock time of the last letter/word commit

# ASL alphabet reference chart displayed in the UI side panel
HELP_CHART_URL = "https://commons.wikimedia.org/wiki/Special:FilePath/Asl_alphabet_gallaudet_ann.png"

# ================== Helpers ==================
def most_common(q):
    """Return (value, count) for the modal non-None entry of *q*.

    *q* is any iterable of letters / None markers (the rolling detection
    history). When the iterable holds no real detections, returns (None, 0).
    Ties resolve the same way Counter.most_common does (insertion order).
    """
    detections = [v for v in q if v is not None]
    if not detections:
        return None, 0
    value, votes = Counter(detections).most_common(1)[0]
    return value, votes

def reset_word():
    """Discard the in-progress word and the detection history window."""
    global current_word, history
    history.clear()
    current_word = []

def reset_sentence():
    """Wipe all transcription state: sentence, current word, and history."""
    global sentence, current_word, history
    history.clear()
    current_word = []
    sentence = []

# ================== Video Frame Handler ==================
def process_frame(frame: np.ndarray):
    """
    FastRTC handler: receives RGB (H, W, 3); returns annotated RGB.
    Keeps running state in globals so words persist across frames.

    Pipeline per frame: mirror -> YOLO detect -> take best single-letter box
    -> debounce via rolling history -> commit letters/words -> draw overlay.
    """
    global history, current_word, sentence, last_commit_time

    if frame is None:
        return None

    # mirror like a selfie camera
    frame = cv2.flip(frame, 1)

    # run detection
    # NOTE(review): Ultralytics treats raw ndarray input as BGR while FastRTC
    # delivers RGB, so channels reach the model swapped — confirm the model
    # tolerates this (hand shapes are largely color-agnostic, but verify).
    try:
        res = model.predict(frame, verbose=False, conf=CONF_THRES)
    except Exception as e:
        # Surface the failure on the video feed instead of killing the stream.
        cv2.putText(frame, f"Model error: {e}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
        return frame

    # Keep only the single highest-confidence box, and only if its class name
    # is a one-letter label in VALID; otherwise this frame records None.
    letter, conf = None, 0.0
    if res and len(res[0].boxes) > 0:
        scores = res[0].boxes.conf.cpu().numpy()
        k = scores.argmax()
        idx = int(res[0].boxes.cls[k].item())
        conf = float(scores[k])
        cand = id2label.get(idx, "").upper()
        if cand in VALID:
            letter = cand

    # Debounce: a letter locks only after COMMIT_STREAK agreeing frames
    # inside the rolling history window.
    history.append(letter)
    top, count = most_common(history)

    now = time.time()
    if top and count >= COMMIT_STREAK:
        current_word.append(top)
        history.clear()        # restart the streak for the next letter
        last_commit_time = now

    # A sustained gap with no stable letter ends the current word.
    if (top is None) and (now - last_commit_time > WORD_BREAK_SECONDS) and current_word:
        sentence.append("".join(current_word))
        current_word = []
        last_commit_time = now

    # ---------- overlay ----------
    h, w = frame.shape[:2]
    overlay = frame.copy()
    cv2.rectangle(overlay, (0, 0), (w, 120), (0, 0, 0), -1)       # header bar
    frame = cv2.addWeighted(overlay, 0.55, frame, 0.45, 0)        # translucent blend

    label = top if top else "β€”"
    cv2.putText(frame, f"Letter: {label}  (stability {count}/{COMMIT_STREAK})",
                (16, 64), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)

    # Progress bar: how close the current candidate letter is to locking.
    pct = int((w - 32) * (min(1.0, count/COMMIT_STREAK) if top else 0.0))
    cv2.rectangle(frame, (16, 74), (16 + pct, 84), (255, 255, 255), -1)

    word_str = "".join(current_word)
    sent_str = " ".join(sentence[-12:]) if sentence else ""       # show last 12 words only
    cv2.putText(frame, f"Word: {word_str} | Sentence: {sent_str}",
                (16, 104), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 1)

    return frame

# ================== Gradio UI ==================
# Visual theme for the Blocks app.
THEME = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="slate"
)

# Custom CSS: container width, sticky header, webcam centering, help-chart
# sizing, and a mobile breakpoint for the chart.
CSS = """
/* container width and centering */
#app {max-width: 980px; margin: 0 auto;}
/* sticky title */
.sticky-head {position: sticky; top: 0; z-index: 5; backdrop-filter: blur(6px);}
.sticky-head > * {margin: 0 !important;}
/* webcam panel */
.webrtc-wrap {display:flex; justify-content:center;}
.webrtc-wrap .wrap {width: 100%;}
/* shrink the help chart */
.help-chart img {max-height: 360px; width: auto;}
/* tighter accordions */
.gr-accordion .label {font-weight: 600;}
/* mobile niceties */
@media (max-width: 768px){
  .help-chart img {max-height: 240px;}
}
"""

# Build the Gradio UI: sticky header, webcam + controls column, reference
# chart column, and runtime wiring for sliders/buttons/stream.
with gr.Blocks(title="SpellTalk β€” ASL fingerspelling (live)", theme=THEME, css=CSS, elem_id="app") as demo:
    with gr.Column(elem_classes="sticky-head"):
        gr.Markdown("## πŸ–οΈ SpellTalk – ASL fingerspelling β†’ words (live stream)")
        gr.Markdown(
            "Click **Start camera** below. Hold one hand in frame, shoulder-high, palm per the chart. "
            "When a letter is stable, it locks into the **Word**. Pause briefly to end a word."
        )

    with gr.Row():
        # Left column: live webcam feed, clear buttons, tuning sliders.
        with gr.Column(scale=3):
            with gr.Group(elem_classes="webrtc-wrap"):
                webrtc = WebRTC(label="Webcam (live)", mode="send-receive", modality="video")
            with gr.Row():
                clear_word = gr.Button("Clear word")
                clear_sent = gr.Button("Clear sentence")

            with gr.Accordion("Settings", open=False):
                th = gr.Slider(0.05, 0.9, value=CONF_THRES, step=0.05, label="Confidence threshold")
                hist = gr.Slider(3, 21, value=HIST_LEN, step=1, label="History length (frames)")
                streak = gr.Slider(2, 12, value=COMMIT_STREAK, step=1, label="Stability to lock (frames)")
                pause = gr.Slider(0.3, 2.5, value=WORD_BREAK_SECONDS, step=0.1, label="Pause β†’ new word (seconds)")

        # Right column: static ASL alphabet reference and usage tips.
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("**ASL alphabet (reference)**")
                gr.Image(value=HELP_CHART_URL, show_label=False, elem_classes="help-chart", interactive=False)
            gr.Markdown(
                "Tips:\n"
                "- Good, even lighting; plain background.\n"
                "- Keep hand within the frame; avoid fast motion.\n"
                "- Letters like **J**/**Z** need motion; this model is tuned mostly for A–Y."
            )

    # Wire up runtime updates for global settings
    # NOTE(review): these setters mutate module-level globals, so a slider
    # change affects every connected session, not just the one that moved it.
    def _set_conf(v):
        global CONF_THRES; CONF_THRES = float(v)
    def _set_hist(v):
        # Rebuild the deque so the new maxlen takes effect (keeps newest items).
        global HIST_LEN, history; HIST_LEN = int(v); history = deque(history, maxlen=HIST_LEN)
    def _set_streak(v):
        global COMMIT_STREAK; COMMIT_STREAK = int(v)
    def _set_pause(v):
        global WORD_BREAK_SECONDS; WORD_BREAK_SECONDS = float(v)

    th.change(_set_conf, th, None)
    hist.change(_set_hist, hist, None)
    streak.change(_set_streak, streak, None)
    pause.change(_set_pause, pause, None)

    clear_word.click(lambda: reset_word(), None, None)
    clear_sent.click(lambda: reset_sentence(), None, None)

    # Stream hookup (10 min per session keeps Spaces happy)
    webrtc.stream(fn=process_frame, inputs=[webrtc], outputs=[webrtc], time_limit=600)

if __name__ == "__main__":
    demo.queue()   # enable Gradio's request queue before serving
    demo.launch()