# OmniSign VLM demo Space — Gradio front-end that forwards videos to a
# private inference backend Space.
import os
import shutil

import gradio as gr
from gradio_client import Client, handle_file
from huggingface_hub import hf_hub_download
# 1. SECRETS & BACKEND LINK
# HF_TOKEN must be configured as a Space secret so the private backend
# accepts our requests; it is None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
PRIVATE_SPACE = "st192011/ASL-VLS-Private"

# 2. DEFINITIVE SUPPORTED VOCABULARY LIST
# Each entry is (WLASL sample id, gloss). A gloss can have several sample
# videos. NOTE(review): ("00378", "ADDRESS") appears twice; the duplicate
# collapses harmlessly when building the dropdown map below.
SUPPORTED_VIDEOS = [
    ("00944", "ADAPT"), ("00963", "ADD"), ("01064", "ADJECTIVE"), ("00335", "ABDOMEN"),
    ("00689", "ACCOUNTANT"), ("00899", "ACTOR"), ("00584", "ACCENT"), ("00632", "ACCIDENT"),
    ("00586", "ACCENT"), ("00585", "ACCENT"), ("00626", "ACCIDENT"), ("00623", "ACCIDENT"),
    ("00846", "ACT"), ("00890", "ACTIVITY"), ("00898", "ACTOR"), ("01011", "ADDRESS"),
    ("00834", "ACROSS"), ("00624", "ACCIDENT"), ("00593", "ACCEPT"), ("00415", "ABOUT"),
    ("00961", "ADD"), ("00962", "ADD"), ("00594", "ACCEPT"), ("00964", "ADD"),
    ("00666", "ACCOMPLISH"), ("01065", "ADJECTIVE"), ("00628", "ACCIDENT"), ("00868", "ACTIVE"),
    ("00836", "ACROSS"), ("00430", "ABOVE"), ("00835", "ACROSS"), ("00946", "ADAPT"),
    ("00943", "ADAPT"), ("00414", "ABOUT"), ("00376", "ABLE"), ("00832", "ACROSS"),
    ("00627", "ACCIDENT"), ("00592", "ACCEPT"), ("00625", "ACCIDENT"), ("01012", "ADDRESS"),
    ("00849", "ACT"), ("00663", "ACCOMPLISH"), ("00853", "ACTION"), ("00967", "ADD"),
    ("00692", "ACCOUNTANT"), ("00583", "ACCENT"), ("00341", "ACROSS"), ("00378", "ADDRESS"),
    ("00433", "ADJECTIVE"), ("00384", "ACTOR"), ("00381", "ACTOR"), ("00377", "ACCIDENT"),
    ("00382", "ACTOR"), ("00378", "ADDRESS")
]

# Unique gloss names, alphabetical, shown in the vocabulary accordion.
SUPPORTED_GLOSSES_UNIQUE = sorted({gloss for _, gloss in SUPPORTED_VIDEOS})

# Dropdown label -> file path inside the Voxel51/WLASL dataset repo.
# zfill(5) is a no-op for the 5-digit ids above but guards shorter ids.
dataset_options = {
    f"{gloss} (Sample {vid})": f"data/data_0/{vid.zfill(5)}.mp4"
    for vid, gloss in SUPPORTED_VIDEOS
}
# 3. INITIALIZE CLIENT
# Connect to the private inference Space. If the connection fails we keep
# the UI alive with client=None, but log the reason instead of silently
# discarding it (the original bound the exception and never used it).
try:
    client = Client(PRIVATE_SPACE, token=HF_TOKEN)
except Exception as exc:
    print(f"WARNING: backend client unavailable: {exc}")
    client = None
# 4. LOGIC FUNCTIONS
def update_video_display(selection):
    """Fetch the chosen WLASL sample video and show its ground-truth gloss.

    Args:
        selection: Dropdown label of the form "GLOSS (Sample 00000)", or a
            falsy value when the dropdown is cleared.

    Returns:
        (local video path, "Ground Truth: GLOSS") on success, or
        (None, None) for an empty selection, or
        (None, "Error loading sample") when the lookup/download/copy fails.
    """
    if not selection:
        return None, None
    try:
        gloss_gt = selection.split('(')[0].strip()
        hf_path = dataset_options[selection]
        cache_path = hf_hub_download(repo_id="Voxel51/WLASL", filename=hf_path, repo_type="dataset")
        # Copy out of the read-only HF cache so Gradio can serve the file.
        local_path = os.path.join("/tmp", os.path.basename(hf_path))
        shutil.copy(cache_path, local_path)
        return local_path, f"Ground Truth: {gloss_gt}"
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # no longer swallowed; any lookup or I/O failure still degrades to a
        # friendly message.
        return None, "Error loading sample"
def run_omnisign_vlm(video_path):
    """Send a video to the private backend Space and return label scores.

    Args:
        video_path: Local path of the video to classify, or a falsy value.

    Returns:
        A dict mapping label text to confidence, suitable for gr.Label.
        Sentinel single-entry dicts are returned when there is no input,
        the backend client is offline, or the remote call fails.
    """
    if not video_path:
        return {"β οΈ No Input": 0.0}
    if not client:
        return {"β οΈ Engine Offline": 0.0}
    try:
        # Request from the private Space's /predict_sign endpoint.
        result = client.predict(
            handle_file(video_path),
            api_name="/predict_sign"
        )
        # --- PARSING LOGIC ---
        # gr.Label payloads arrive as {"label": ..., "confidences": [...]};
        # flatten to the {label: confidence} dict gr.Label also accepts.
        if isinstance(result, dict) and "confidences" in result:
            return {item['label']: item['confidence'] for item in result["confidences"]}
        return result
    except Exception:
        # The original used an f-string with no placeholders and an unused
        # `as e` binding; both removed.
        return {"β Neural Analysis Error": 0.0}
# 5. UI DESIGN (PROFESSIONAL PITCH)
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Header / pitch copy. Plain string: the original f-string had no
    # placeholders, so the f-prefix was dead weight.
    gr.Markdown("""
# π§ OmniSign VLM: Neural Universal SL Protocol
### **Powered by Multimodal Temporal Reasoning**
This demonstration showcases a revolutionary **Structural Protocol** for sign language interpretation, powered by a Large Vision-Language Model (VLM) core. Our protocol focuses on extracting pure **kinetic semantics** from video streams.
**The OmniSign Protocol's Unique Advantages:**
1. **Motion-Oriented Core:** The system is designed to analyze the physics and trajectory of movement, rendering the prediction robust against variations in the signer, lighting, or background environment.
2. **Lexical Agnosticism:** The underlying VLM protocol is language-independent. It can be instantly updated to recognize new signs or expanded to include any sign language (e.g., ASL, BSL, LSF) with unparalleled efficiency.
3. **Future-Proof Scalability:** New vocabulary can be integrated into the system's lexicon instantly, bypassing traditional, time-consuming retraining cycles.
---
This engine is currently not yet **fully optimized for predictive accuracy** and operates on a limited vocabulary. Its sole purpose is to demonstrate a **highly versatile and scalable VLM protocol**.
""")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### π¦ 1. Input Source")
            video_comp = gr.Video(label="Video Buffer", autoplay=True)
            # Leading "" choice lets the user clear the selection.
            dataset_drop = gr.Dropdown(
                choices=[""] + sorted(dataset_options),
                label="Browse WLASL Archive",
            )
            gt_output = gr.Textbox(label="Ground Truth Reference", interactive=False)
            run_btn = gr.Button("π Execute Neural Analysis", variant="primary")
        with gr.Column():
            gr.Markdown("### π 2. VLM Perception Result")
            output_label = gr.Label(num_top_classes=3, label="Confidence Score")
            with gr.Accordion("π Supported Vocabulary List", open=True):
                gr.Markdown(", ".join(SUPPORTED_GLOSSES_UNIQUE))

    # Event wiring: choosing a sample loads video + ground truth; the button
    # sends the current video buffer to the backend.
    dataset_drop.change(fn=update_video_display, inputs=dataset_drop, outputs=[video_comp, gt_output])
    run_btn.click(fn=run_omnisign_vlm, inputs=video_comp, outputs=output_label)

if __name__ == "__main__":
    # Disable server-side rendering at launch (kept from the original).
    demo.launch(ssr_mode=False)