# app.py — SpellTalk (polished UI for HF Spaces)
import os

os.environ["YOLO_CONFIG_DIR"] = "/tmp/Ultralytics"  # writable dir on Spaces

import string
import time
from collections import Counter, deque

import cv2
import numpy as np
import gradio as gr
from huggingface_hub import hf_hub_download
from ultralytics import YOLO

# ---- FastRTC WebRTC component
from fastrtc import WebRTC

# ================== Model ==================
weights_path = hf_hub_download("atalaydenknalbant/asl-yolo-models", filename="yolo11n.pt")
model = YOLO(weights_path)
id2label = model.names  # {class_id: label string}

# Only single ASCII-letter class names count as fingerspelling letters.
VALID = {
    lbl.upper()
    for lbl in id2label.values()
    if len(lbl) == 1 and lbl in string.ascii_letters
}

# ================== App State (globals) ==================
# NOTE(review): this state is process-global, so on a multi-user Space every
# viewer shares one word/sentence buffer and one settings set. Fine for a
# single-user demo; per-session state would need gr.State or a session map.
CONF_THRES = 0.35
HIST_LEN = 9              # history window (frames)
COMMIT_STREAK = 5         # frames of agreement to lock a letter
WORD_BREAK_SECONDS = 1.0  # pause with no detections that ends a word

history = deque(maxlen=HIST_LEN)
current_word, sentence = [], []
last_commit_time = time.time()

HELP_CHART_URL = "https://commons.wikimedia.org/wiki/Special:FilePath/Asl_alphabet_gallaudet_ann.png"


# ================== Helpers ==================
def most_common(q):
    """Return (value, count) for the most frequent non-None item in *q*.

    Returns (None, 0) when the window contains no detections at all.
    """
    items = [x for x in q if x is not None]
    if not items:
        return None, 0
    value, count = Counter(items).most_common(1)[0]
    return value, count


def reset_word():
    """Clear the letter buffer and the stability window."""
    global current_word
    current_word = []
    history.clear()


def reset_sentence():
    """Clear everything: sentence, current word, and stability window."""
    global sentence, current_word
    sentence = []
    current_word = []
    history.clear()


# ================== Video Frame Handler ==================
def _detect_letter(frame):
    """Run YOLO on *frame*; return (letter, confidence) for the best box.

    Returns (None, 0.0) when there is no box or the top class is not a
    single-letter label. Model errors propagate to the caller.
    """
    res = model.predict(frame, verbose=False, conf=CONF_THRES)
    if not res or len(res[0].boxes) == 0:
        return None, 0.0
    scores = res[0].boxes.conf.cpu().numpy()
    k = int(scores.argmax())
    cand = id2label.get(int(res[0].boxes.cls[k].item()), "").upper()
    if cand in VALID:
        return cand, float(scores[k])
    return None, 0.0


def _draw_overlay(frame, top, count):
    """Draw the translucent header bar (letter, stability, word, sentence)."""
    h, w = frame.shape[:2]
    overlay = frame.copy()
    cv2.rectangle(overlay, (0, 0), (w, 120), (0, 0, 0), -1)  # header bar
    frame = cv2.addWeighted(overlay, 0.55, frame, 0.45, 0)

    # OpenCV's Hershey fonts render ASCII only, so the "no letter"
    # placeholder must be '-' — an em dash would draw as garbage glyphs.
    label = top if top else "-"
    cv2.putText(frame, f"Letter: {label} (stability {count}/{COMMIT_STREAK})",
                (16, 64), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)

    # Stability progress bar under the letter line.
    pct = int((w - 32) * (min(1.0, count / COMMIT_STREAK) if top else 0.0))
    cv2.rectangle(frame, (16, 74), (16 + pct, 84), (255, 255, 255), -1)

    word_str = "".join(current_word)
    sent_str = " ".join(sentence[-12:]) if sentence else ""
    cv2.putText(frame, f"Word: {word_str} | Sentence: {sent_str}",
                (16, 104), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 1)
    return frame


def process_frame(frame: np.ndarray):
    """FastRTC handler: receives RGB (H, W, 3); returns annotated RGB.

    Keeps running state in globals so words persist across frames.
    """
    global current_word, last_commit_time

    if frame is None:
        return None

    # mirror like a selfie camera
    frame = cv2.flip(frame, 1)

    try:
        # _conf is currently informational only; kept for future display.
        letter, _conf = _detect_letter(frame)
    except Exception as e:
        # Surface model failures on the video itself instead of crashing
        # the stream handler.
        cv2.putText(frame, f"Model error: {e}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
        return frame

    history.append(letter)  # None entries dilute stability on purpose
    top, count = most_common(history)
    now = time.time()

    # Lock the letter once it has been stable long enough.
    if top and count >= COMMIT_STREAK:
        current_word.append(top)
        history.clear()
        last_commit_time = now

    # A long-enough pause with nothing detected ends the current word.
    if top is None and (now - last_commit_time > WORD_BREAK_SECONDS) and current_word:
        sentence.append("".join(current_word))
        current_word = []
        last_commit_time = now

    return _draw_overlay(frame, top, count)


# ================== Gradio UI ==================
THEME = gr.themes.Soft(primary_hue="blue", neutral_hue="slate")

CSS = """
/* container width and centering */
#app {max-width: 980px; margin: 0 auto;}
/* sticky title */
.sticky-head {position: sticky; top: 0; z-index: 5; backdrop-filter: blur(6px);}
.sticky-head > * {margin: 0 !important;}
/* webcam panel */
.webrtc-wrap {display:flex; justify-content:center;}
.webrtc-wrap .wrap {width: 100%;}
/* shrink the help chart */
.help-chart img {max-height: 360px; width: auto;}
/* tighter accordions */
.gr-accordion .label {font-weight: 600;}
/* mobile niceties */
@media (max-width: 768px){
  .help-chart img {max-height: 240px;}
}
"""

with gr.Blocks(title="SpellTalk — ASL fingerspelling (live)", theme=THEME,
               css=CSS, elem_id="app") as demo:
    with gr.Column(elem_classes="sticky-head"):
        gr.Markdown("## 🖐️ SpellTalk – ASL fingerspelling → words (live stream)")
        gr.Markdown(
            "Click **Start camera** below. Hold one hand in frame, shoulder-high, palm per the chart. "
            "When a letter is stable, it locks into the **Word**. Pause briefly to end a word."
        )

    with gr.Row():
        with gr.Column(scale=3):
            with gr.Group(elem_classes="webrtc-wrap"):
                webrtc = WebRTC(label="Webcam (live)", mode="send-receive", modality="video")
            with gr.Row():
                clear_word = gr.Button("Clear word")
                clear_sent = gr.Button("Clear sentence")
            with gr.Accordion("Settings", open=False):
                th = gr.Slider(0.05, 0.9, value=CONF_THRES, step=0.05,
                               label="Confidence threshold")
                hist = gr.Slider(3, 21, value=HIST_LEN, step=1,
                                 label="History length (frames)")
                streak = gr.Slider(2, 12, value=COMMIT_STREAK, step=1,
                                   label="Stability to lock (frames)")
                pause = gr.Slider(0.3, 2.5, value=WORD_BREAK_SECONDS, step=0.1,
                                  label="Pause → new word (seconds)")
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("**ASL alphabet (reference)**")
                gr.Image(value=HELP_CHART_URL, show_label=False,
                         elem_classes="help-chart", interactive=False)
                gr.Markdown(
                    "Tips:\n"
                    "- Good, even lighting; plain background.\n"
                    "- Keep hand within the frame; avoid fast motion.\n"
                    "- Letters like **J**/**Z** need motion; this model is tuned mostly for A–Y."
                )

    # Wire up runtime updates for global settings.
    def _set_conf(v):
        global CONF_THRES
        CONF_THRES = float(v)

    def _set_hist(v):
        # Rebinding with a new maxlen keeps the most recent entries.
        global HIST_LEN, history
        HIST_LEN = int(v)
        history = deque(history, maxlen=HIST_LEN)

    def _set_streak(v):
        global COMMIT_STREAK
        COMMIT_STREAK = int(v)

    def _set_pause(v):
        global WORD_BREAK_SECONDS
        WORD_BREAK_SECONDS = float(v)

    th.change(_set_conf, th, None)
    hist.change(_set_hist, hist, None)
    streak.change(_set_streak, streak, None)
    pause.change(_set_pause, pause, None)
    clear_word.click(reset_word, None, None)
    clear_sent.click(reset_sentence, None, None)

    # Stream hookup (10 min per session keeps Spaces happy).
    webrtc.stream(fn=process_frame, inputs=[webrtc], outputs=[webrtc], time_limit=600)

if __name__ == "__main__":
    demo.queue()
    demo.launch()