File size: 5,353 Bytes
ed5073f
6bf42f7
ed5073f
 
776ce26
ed5073f
7df1f0a
ed5073f
f552d94
ed5073f
f552d94
 
 
 
 
 
 
 
 
 
 
 
 
05e228c
 
 
7df1f0a
05e228c
776ce26
6bf42f7
776ce26
7df1f0a
f552d94
 
7df1f0a
ed5073f
776ce26
ed5073f
05e228c
c450334
05e228c
7df1f0a
c450334
7df1f0a
c450334
05e228c
776ce26
05e228c
ed5073f
43cfccd
05e228c
 
f552d94
ed5073f
05e228c
 
f552d94
 
ed5073f
a8e25e5
05e228c
 
 
 
 
a8e25e5
6bf42f7
05e228c
ed5073f
05e228c
43cfccd
05e228c
 
 
 
 
 
 
 
 
 
 
 
 
01ab6f8
05e228c
ed5073f
 
 
05e228c
 
 
 
 
ed5073f
 
05e228c
 
 
 
 
ed5073f
05e228c
 
ed5073f
 
c450334
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import shutil
import gradio as gr
from gradio_client import Client, handle_file
from huggingface_hub import hf_hub_download

# 1. SECRETS & BACKEND LINK
# HF_TOKEN comes from the Space's secrets; None when unset (Client then
# connects without credentials).
HF_TOKEN = os.environ.get("HF_TOKEN")
PRIVATE_SPACE = "st192011/ASL-VLS-Private"

# 2. DEFINITIVE SUPPORTED VOCABULARY LIST
# (video_id, gloss) pairs referencing clips in the Voxel51/WLASL dataset repo.
# An exact duplicate entry ("00378", "ADDRESS") was removed; it previously
# collapsed silently when building the dataset_options dict.
SUPPORTED_VIDEOS = [
    ("00944", "ADAPT"), ("00963", "ADD"), ("01064", "ADJECTIVE"), ("00335", "ABDOMEN"),
    ("00689", "ACCOUNTANT"), ("00899", "ACTOR"), ("00584", "ACCENT"), ("00632", "ACCIDENT"),
    ("00586", "ACCENT"), ("00585", "ACCENT"), ("00626", "ACCIDENT"), ("00623", "ACCIDENT"),
    ("00846", "ACT"), ("00890", "ACTIVITY"), ("00898", "ACTOR"), ("01011", "ADDRESS"),
    ("00834", "ACROSS"), ("00624", "ACCIDENT"), ("00593", "ACCEPT"), ("00415", "ABOUT"),
    ("00961", "ADD"), ("00962", "ADD"), ("00594", "ACCEPT"), ("00964", "ADD"),
    ("00666", "ACCOMPLISH"), ("01065", "ADJECTIVE"), ("00628", "ACCIDENT"), ("00868", "ACTIVE"),
    ("00836", "ACROSS"), ("00430", "ABOVE"), ("00835", "ACROSS"), ("00946", "ADAPT"),
    ("00943", "ADAPT"), ("00414", "ABOUT"), ("00376", "ABLE"), ("00832", "ACROSS"),
    ("00627", "ACCIDENT"), ("00592", "ACCEPT"), ("00625", "ACCIDENT"), ("01012", "ADDRESS"),
    ("00849", "ACT"), ("00663", "ACCOMPLISH"), ("00853", "ACTION"), ("00967", "ADD"),
    ("00692", "ACCOUNTANT"), ("00583", "ACCENT"), ("00341", "ACROSS"), ("00378", "ADDRESS"),
    ("00433", "ADJECTIVE"), ("00384", "ACTOR"), ("00381", "ACTOR"), ("00377", "ACCIDENT"),
    ("00382", "ACTOR"),
]
# Alphabetical list of distinct glosses, shown in the vocabulary accordion.
SUPPORTED_GLOSSES_UNIQUE = sorted({g for _, g in SUPPORTED_VIDEOS})
# Dropdown label -> relative path of the sample clip inside the dataset repo.
dataset_options = {f"{g} (Sample {vid})": f"data/data_0/{vid.zfill(5)}.mp4" for vid, g in SUPPORTED_VIDEOS}

# 3. INITIALIZE CLIENT
# Best-effort connection to the private inference Space. Failure is
# deliberately non-fatal — the UI still loads and run_omnisign_vlm reports
# "Engine Offline" — but it was previously silent; log the cause for the logs.
try:
    client = Client(PRIVATE_SPACE, token=HF_TOKEN)
except Exception as e:
    print(f"[OmniSign] Could not connect to backend Space: {e}")
    client = None

# 4. LOGIC FUNCTIONS
def update_video_display(selection):
    """Resolve a dropdown selection to a playable local clip plus its ground truth.

    Args:
        selection: Dropdown label like "ADAPT (Sample 00944)"; falsy when cleared.

    Returns:
        Tuple of (local_video_path, status_text). (None, None) when nothing is
        selected; (None, "Error loading sample") when the download/copy fails.
    """
    if not selection:
        return None, None
    try:
        # The text before "(" is the gloss: "ADAPT (Sample 00944)" -> "ADAPT".
        gloss_gt = selection.split('(')[0].strip()
        hf_path = dataset_options[selection]
        # Download from the public WLASL dataset repo (cached by huggingface_hub),
        # then copy into /tmp so Gradio can serve it from a stable local path.
        cache_path = hf_hub_download(repo_id="Voxel51/WLASL", filename=hf_path, repo_type="dataset")
        local_path = os.path.join("/tmp", os.path.basename(hf_path))
        shutil.copy(cache_path, local_path)
        return local_path, f"Ground Truth: {gloss_gt}"
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return None, "Error loading sample"

def run_omnisign_vlm(video_path):
    """Send a video to the private backend Space and return label confidences.

    Args:
        video_path: Local path of the recorded/selected video, or falsy.

    Returns:
        Dict mapping label -> confidence suitable for gr.Label. Single-entry
        sentinel dicts signal missing input, an offline backend, or a remote
        failure (keys kept byte-identical to the original messages).
    """
    if not video_path:
        return {"โš ๏ธ No Input": 0.0}
    if not client:
        return {"โš ๏ธ Engine Offline": 0.0}

    try:
        # Request from Private Space
        result = client.predict(
            handle_file(video_path),
            api_name="/predict_sign"
        )

        # --- PARSING LOGIC ---
        # A gr.Label payload arrives as {"label": ..., "confidences": [...]};
        # flatten it to the plain {label: confidence} mapping gr.Label accepts.
        if isinstance(result, dict) and "confidences" in result:
            return {item['label']: item['confidence'] for item in result["confidences"]}

        return result

    except Exception:
        # Was an f-string with no placeholders and an unused `as e` binding;
        # the message text itself is unchanged.
        return {"โŒ Neural Analysis Error": 0.0}

# 5. UI DESIGN (PROFESSIONAL PITCH)
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Pitch header (was an f-string with no placeholders; plain literal now,
    # text byte-identical).
    gr.Markdown("""
    # ๐Ÿง  OmniSign VLM: Neural Universal SL Protocol
    ### **Powered by Multimodal Temporal Reasoning**

    This demonstration showcases a revolutionary **Structural Protocol** for sign language interpretation, powered by a Large Vision-Language Model (VLM) core. Our protocol focuses on extracting pure **kinetic semantics** from video streams.

    **The OmniSign Protocol's Unique Advantages:**

    1.  **Motion-Oriented Core:** The system is designed to analyze the physics and trajectory of movement, rendering the prediction robust against variations in the signer, lighting, or background environment.
    2.  **Lexical Agnosticism:** The underlying VLM protocol is language-independent. It can be instantly updated to recognize new signs or expanded to include any sign language (e.g., ASL, BSL, LSF) with unparalleled efficiency.
    3.  **Future-Proof Scalability:** New vocabulary can be integrated into the system's lexicon instantly, bypassing traditional, time-consuming retraining cycles.

    ---
    This engine is currently not yet **fully optimized for predictive accuracy** and operates on a limited vocabulary. Its sole purpose is to demonstrate a **highly versatile and scalable VLM protocol**. 
    """)
    
    with gr.Row():
        with gr.Column():
            # Left column: video input, sample browser, and the run trigger.
            gr.Markdown("### ๐ŸŽฆ 1. Input Source")
            video_comp = gr.Video(label="Video Buffer", autoplay=True)
            # Leading "" entry lets the user clear the selection; the change
            # handler returns (None, None) for it.
            dataset_drop = gr.Dropdown(choices=[""] + sorted(list(dataset_options.keys())), label="Browse WLASL Archive")
            gt_output = gr.Textbox(label="Ground Truth Reference", interactive=False)
            run_btn = gr.Button("๐Ÿš€ Execute Neural Analysis", variant="primary")
            
        with gr.Column():
            # Right column: prediction confidences and the supported vocabulary.
            gr.Markdown("### ๐Ÿ“Š 2. VLM Perception Result")
            output_label = gr.Label(num_top_classes=3, label="Confidence Score")
            
            with gr.Accordion("๐Ÿ” Supported Vocabulary List", open=True):
                gr.Markdown(", ".join(SUPPORTED_GLOSSES_UNIQUE))

    # Wiring: sample selection loads the clip + ground truth; the button runs
    # inference on whatever is in the video buffer.
    dataset_drop.change(fn=update_video_display, inputs=dataset_drop, outputs=[video_comp, gt_output])
    run_btn.click(fn=run_omnisign_vlm, inputs=video_comp, outputs=output_label)

if __name__ == "__main__":
    # NOTE(review): ssr_mode=False disables Gradio's server-side rendering —
    # presumably to avoid SSR issues on Hugging Face Spaces; confirm.
    demo.launch(ssr_mode=False)