# NOTE(review): the lines that previously sat here were Hugging Face Space
# page-scrape residue (status badges, file size, commit hashes, gutter line
# numbers) — not Python source. Removed so the module parses.
import os
import shutil
import gradio as gr
from gradio_client import Client, handle_file
from huggingface_hub import hf_hub_download
# 1. SECRETS & BACKEND LINK
# HF_TOKEN must grant read access to the private backend Space below.
HF_TOKEN = os.environ.get("HF_TOKEN")
PRIVATE_SPACE = "st192011/ASL-VLS-Private"

# 2. DEFINITIVE SUPPORTED VOCABULARY LIST
# (video_id, gloss) pairs from the WLASL dataset. Several glosses have multiple
# sample clips; ("00378", "ADDRESS") appears twice, which is harmless because
# the derived dict/set below deduplicate.
SUPPORTED_VIDEOS = [
    ("00944", "ADAPT"), ("00963", "ADD"), ("01064", "ADJECTIVE"), ("00335", "ABDOMEN"),
    ("00689", "ACCOUNTANT"), ("00899", "ACTOR"), ("00584", "ACCENT"), ("00632", "ACCIDENT"),
    ("00586", "ACCENT"), ("00585", "ACCENT"), ("00626", "ACCIDENT"), ("00623", "ACCIDENT"),
    ("00846", "ACT"), ("00890", "ACTIVITY"), ("00898", "ACTOR"), ("01011", "ADDRESS"),
    ("00834", "ACROSS"), ("00624", "ACCIDENT"), ("00593", "ACCEPT"), ("00415", "ABOUT"),
    ("00961", "ADD"), ("00962", "ADD"), ("00594", "ACCEPT"), ("00964", "ADD"),
    ("00666", "ACCOMPLISH"), ("01065", "ADJECTIVE"), ("00628", "ACCIDENT"), ("00868", "ACTIVE"),
    ("00836", "ACROSS"), ("00430", "ABOVE"), ("00835", "ACROSS"), ("00946", "ADAPT"),
    ("00943", "ADAPT"), ("00414", "ABOUT"), ("00376", "ABLE"), ("00832", "ACROSS"),
    ("00627", "ACCIDENT"), ("00592", "ACCEPT"), ("00625", "ACCIDENT"), ("01012", "ADDRESS"),
    ("00849", "ACT"), ("00663", "ACCOMPLISH"), ("00853", "ACTION"), ("00967", "ADD"),
    ("00692", "ACCOUNTANT"), ("00583", "ACCENT"), ("00341", "ACROSS"), ("00378", "ADDRESS"),
    ("00433", "ADJECTIVE"), ("00384", "ACTOR"), ("00381", "ACTOR"), ("00377", "ACCIDENT"),
    ("00382", "ACTOR"), ("00378", "ADDRESS"),
]

# Alphabetized unique glosses for the vocabulary panel in the UI.
# (set comprehension instead of sorted(list(set([...]))) — same result.)
SUPPORTED_GLOSSES_UNIQUE = sorted({gloss for _, gloss in SUPPORTED_VIDEOS})

# Dropdown label -> repo-relative path of the sample clip inside the dataset.
dataset_options = {f"{g} (Sample {vid})": f"data/data_0/{vid.zfill(5)}.mp4" for vid, g in SUPPORTED_VIDEOS}
# 3. INITIALIZE CLIENT
# Best-effort connection to the private backend Space. On any failure
# (bad token, Space asleep, network error) we degrade gracefully:
# run_omnisign_vlm checks for `client is None` and reports "Engine Offline".
try:
    client = Client(PRIVATE_SPACE, token=HF_TOKEN)
except Exception:  # noqa: BLE001 — deliberate broad catch; the exception object was unused
    client = None
# 4. LOGIC FUNCTIONS
def update_video_display(selection):
    """Resolve a dropdown selection to a playable local clip.

    Args:
        selection: Dropdown label of the form "GLOSS (Sample NNNNN)", or
            None/"" when nothing is selected.

    Returns:
        (video_path, status_text): a /tmp path plus "Ground Truth: GLOSS",
        (None, None) for an empty selection, or (None, "Error loading sample")
        when lookup/download fails.
    """
    if not selection:
        return None, None
    try:
        # Label format is "GLOSS (Sample NNNNN)" — recover the gloss for display.
        gloss_gt = selection.split('(')[0].strip()
        hf_path = dataset_options[selection]
        # Cached download from the public WLASL dataset repo.
        cache_path = hf_hub_download(repo_id="Voxel51/WLASL", filename=hf_path, repo_type="dataset")
        # Copy out of the read-only HF cache so Gradio can serve the file.
        local_path = os.path.join("/tmp", os.path.basename(hf_path))
        shutil.copy(cache_path, local_path)
        return local_path, f"Ground Truth: {gloss_gt}"
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return None, "Error loading sample"
def run_omnisign_vlm(video_path):
    """Send a video to the private backend Space and return label confidences.

    Args:
        video_path: Local path of the clip to analyze, or None/"".

    Returns:
        dict mapping label -> confidence suitable for gr.Label; sentinel
        single-entry dicts signal "no input", "engine offline", or an error.
    """
    # Restored from mojibake: these emoji were cp874-misdecoded in the source.
    if not video_path:
        return {"⚠️ No Input": 0.0}
    if not client:
        return {"⚠️ Engine Offline": 0.0}
    try:
        # Forward the file to the private Space's /predict_sign endpoint.
        result = client.predict(
            handle_file(video_path),
            api_name="/predict_sign"
        )
        # --- PARSING LOGIC ---
        # A gr.Label payload arrives as {"label": ..., "confidences": [...]}.
        if isinstance(result, dict) and "confidences" in result:
            return {item['label']: item['confidence'] for item in result["confidences"]}
        return result
    except Exception:
        # (dropped a no-placeholder f-string prefix and the unused `as e`)
        return {"❌ Neural Analysis Error": 0.0}
# 5. UI DESIGN (PROFESSIONAL PITCH)
# Emoji below were mojibake'd in the source; 🧠/📦 are recovered from the byte
# pattern, the single-glyph ones (🚀/📊/📖) are plausible reconstructions —
# TODO confirm against the original file.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # (was an f-string with no placeholders — plain literal is equivalent)
    gr.Markdown("""
    # 🧠 OmniSign VLM: Neural Universal SL Protocol
    ### **Powered by Multimodal Temporal Reasoning**
    This demonstration showcases a revolutionary **Structural Protocol** for sign language interpretation, powered by a Large Vision-Language Model (VLM) core. Our protocol focuses on extracting pure **kinetic semantics** from video streams.
    **The OmniSign Protocol's Unique Advantages:**
    1. **Motion-Oriented Core:** The system is designed to analyze the physics and trajectory of movement, rendering the prediction robust against variations in the signer, lighting, or background environment.
    2. **Lexical Agnosticism:** The underlying VLM protocol is language-independent. It can be instantly updated to recognize new signs or expanded to include any sign language (e.g., ASL, BSL, LSF) with unparalleled efficiency.
    3. **Future-Proof Scalability:** New vocabulary can be integrated into the system's lexicon instantly, bypassing traditional, time-consuming retraining cycles.
    ---
    This engine is currently not yet **fully optimized for predictive accuracy** and operates on a limited vocabulary. Its sole purpose is to demonstrate a **highly versatile and scalable VLM protocol**.
    """)
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 📦 1. Input Source")
            video_comp = gr.Video(label="Video Buffer", autoplay=True)
            dataset_drop = gr.Dropdown(choices=[""] + sorted(dataset_options.keys()), label="Browse WLASL Archive")
            gt_output = gr.Textbox(label="Ground Truth Reference", interactive=False)
            run_btn = gr.Button("🚀 Execute Neural Analysis", variant="primary")
        with gr.Column():
            gr.Markdown("### 📊 2. VLM Perception Result")
            output_label = gr.Label(num_top_classes=3, label="Confidence Score")
            with gr.Accordion("📖 Supported Vocabulary List", open=True):
                gr.Markdown(", ".join(SUPPORTED_GLOSSES_UNIQUE))
    # Wire the dropdown to the sample loader and the button to the engine.
    dataset_drop.change(fn=update_video_display, inputs=dataset_drop, outputs=[video_comp, gt_output])
    run_btn.click(fn=run_omnisign_vlm, inputs=video_comp, outputs=output_label)

if __name__ == "__main__":
    # (removed a stray trailing "|" scrape artifact from this line)
    demo.launch(ssr_mode=False)