Update app.py
app.py CHANGED
@@ -2,11 +2,10 @@ import os
 import shutil
 import gradio as gr
 from gradio_client import Client, handle_file
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download
 
 # 1. SECRETS & BACKEND LINK
 HF_TOKEN = os.environ.get("HF_TOKEN")
-# Make sure this matches your private space URL exactly
 PRIVATE_SPACE = "st192011/ASL-VLS-Private"
 
 # 2. DEFINITIVE SUPPORTED VOCABULARY LIST
@@ -22,120 +21,62 @@ SUPPORTED_VIDEOS = [
     ("00943", "ADAPT"), ("00414", "ABOUT"), ("00376", "ABLE"), ("00832", "ACROSS"),
     ("00627", "ACCIDENT"), ("00592", "ACCEPT"), ("00625", "ACCIDENT"), ("01012", "ADDRESS"),
     ("00849", "ACT"), ("00663", "ACCOMPLISH"), ("00853", "ACTION"), ("00967", "ADD"),
-    ("00692", "ACCOUNTANT"), ("00583", "ACCENT"),
-    ("00433", "ADJECTIVE"), ("00384", "ACTOR"), ("00381", "ACTOR"), ("00377", "ACCIDENT"),
-    ("00382", "ACTOR"), ("00378", "ADDRESS")
+    ("00692", "ACCOUNTANT"), ("00583", "ACCENT")
 ]
-
+dataset_options = {f"{g} (Sample {vid})": f"data/data_0/{vid.zfill(5)}.mp4" for vid, g in SUPPORTED_VIDEOS}
 
-# 3. DATASET DISCOVERY
-print("Dataset Discovery: Mapping specific video IDs to Glosses...")
-dataset_options = {}
-for vid_id, gloss in SUPPORTED_VIDEOS:
-    # Construct the full HF path (assuming 5-digit ID)
-    hf_path = f"data/data_0/{vid_id.zfill(5)}.mp4"
-    display_name = f"{gloss} (Sample {vid_id})"
-    dataset_options[display_name] = hf_path
-
-# 4. INITIALIZE CLIENT
-print(f"Attempting connection to {PRIVATE_SPACE}...")
+# 3. INITIALIZE CLIENT
 try:
-    # Use 'token=' (standard) instead of 'hf_token='
     client = Client(PRIVATE_SPACE, token=HF_TOKEN)
-    print("✅ Neural Engine Online!")
 except Exception as e:
-    print(f"❌ Connection Failed: {e}")
     client = None
 
-# 5. LOGIC FUNCTIONS
+# 4. LOGIC FUNCTIONS
 def update_video_display(selection):
-    if not selection: return None
+    if not selection: return None
     try:
-        gloss_gt = selection.split('(')[0].strip()
-
-        # Download the video file to /tmp for local playback
         hf_path = dataset_options[selection]
         cache_path = hf_hub_download(repo_id="Voxel51/WLASL", filename=hf_path, repo_type="dataset")
         local_path = os.path.join("/tmp", os.path.basename(hf_path))
         shutil.copy(cache_path, local_path)
-
-        return local_path, f"Ground Truth: {gloss_gt}"
-    except Exception as e:
-        return None, f"Error downloading sample: {e}"
+        return local_path
+    except:
+        return None
 
 def run_omnisign_vlm(video_path):
-    if not video_path: return
-    if not client: return
+    if not video_path: return "No Input"
+    if not client: return "Engine Offline"
 
     try:
-        #
-        result = client.predict(
+        # PURE INQUIRY: Capture the raw response from the Private Space
+        raw_result = client.predict(
             handle_file(video_path),
             api_name="/predict_sign"
         )
 
-        #
-        #
-
-        # Transform complex list -> simple dict for the UI
-        # From: {'confidences': [{'label': 'A', 'confidence': 0.9}, ...]}
-        # To: {'A': 0.9, ...}
-        clean_output = {
-            item['label']: item['confidence']
-            for item in result["confidences"]
-        }
-        return clean_output
-
-        # 3. Fallback: If it's already a simple dict, return it
-        return result
+        # RETURN RAW DATA FOR INSPECTION
+        # This will show the type and the content (e.g., <class 'tuple'> : ({"ADAPT": 0.9},))
+        return f"TYPE: {type(raw_result)}\n\nCONTENT: {raw_result}"
 
     except Exception as e:
-        return
+        return f"API ERROR: {str(e)}"
 
-# 6. UI DESIGN
+# 5. UI DESIGN
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🧠 OmniSign VLM: Neural Universal SL Protocol
-    ### **Powered by Multimodal Temporal Reasoning**
-
-    This demonstration showcases a revolutionary **Structural Protocol** for sign language interpretation, powered by a Large Vision-Language Model (VLM) core. Our protocol focuses on extracting pure **kinetic semantics** from video streams.
-
-    **The OmniSign Protocol's Unique Advantages:**
-
-    1. **Motion-Oriented Core:** The system is designed to analyze the physics and trajectory of movement, rendering the prediction robust against variations in the signer, lighting, or background environment.
-    2. **Lexical Agnosticism:** The underlying VLM protocol is language-independent. It can be instantly updated to recognize new signs or expanded to include any sign language (e.g., ASL, BSL, LSF) with unparalleled efficiency.
-    3. **Future-Proof Scalability:** New vocabulary can be integrated into the system's lexicon instantly, bypassing traditional, time-consuming retraining cycles.
-
-    ---
-    *Notice: This is a structural proof-of-concept. The current engine is unoptimized and operates on a limited vocabulary subset to showcase the protocol's power.*
-    """)
+    gr.Markdown("# 🧠 Diagnostic Mode: OmniSign VLM")
 
     with gr.Row():
         with gr.Column():
-            video_comp = gr.Video(…)
-
-
-            dataset_drop = gr.Dropdown(
-                choices=[""] + sorted(list(dataset_options.keys())),
-                label="Explore WLASL Samples (Verified Support)",
-                value=""
-            )
-
-            gt_output = gr.Textbox(label="Ground Truth", interactive=False, value="Select a sample above to view its Ground Truth.")
-
-            run_btn = gr.Button("Execute Neural Analysis", variant="primary")
+            video_comp = gr.Video(label="Input Buffer", autoplay=True)
+            dataset_drop = gr.Dropdown(choices=[""] + sorted(list(dataset_options.keys())), label="Select Sample")
+            run_btn = gr.Button("Analyze Raw Response", variant="primary")
 
         with gr.Column():
-            output_label = gr.Label(…)
-
-            with gr.Accordion("View Supported Vocabulary List", open=True):
-                gr.Markdown(f"**This demo subset recognizes {len(SUPPORTED_GLOSSES_UNIQUE)} unique words:**")
-                gr.Markdown(", ".join(SUPPORTED_GLOSSES_UNIQUE))
+            # CHANGED: Using Textbox instead of Label to see the raw data structure
+            raw_output_box = gr.Textbox(label="Raw Data Received from Private Space", lines=10)
 
-    dataset_drop.change(fn=update_video_display, inputs=dataset_drop, outputs=[video_comp, gt_output])
-    run_btn.click(fn=run_omnisign_vlm, inputs=video_comp, outputs=output_label)
+    dataset_drop.change(fn=update_video_display, inputs=dataset_drop, outputs=video_comp)
+    run_btn.click(fn=run_omnisign_vlm, inputs=video_comp, outputs=raw_output_box)
 
 if __name__ == "__main__":
     demo.launch(ssr_mode=False)
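
Note on the change: the removed branch flattened the gr.Label-style payload {'confidences': [{'label': ..., 'confidence': ...}, ...]} into a plain {label: confidence} dict, while the new code deliberately returns the raw response so its actual shape can be inspected first. As a minimal sketch of the likely follow-up step once that shape is confirmed: the two shapes handled below (a 'confidences' dict and a one-element tuple wrapper) are only the ones hinted at in the code comments, and normalize_prediction is a hypothetical helper, not part of this commit.

def normalize_prediction(raw):
    # gradio_client can wrap a single output in a one-element tuple, e.g. ({"ADAPT": 0.9},)
    if isinstance(raw, tuple) and len(raw) == 1:
        raw = raw[0]
    # gr.Label payloads carry a 'confidences' list of {'label': ..., 'confidence': ...} dicts
    if isinstance(raw, dict) and "confidences" in raw:
        return {item["label"]: item["confidence"] for item in raw["confidences"]}
    # already a plain {label: confidence} mapping
    if isinstance(raw, dict):
        return raw
    raise ValueError(f"Unrecognized payload: {raw!r}")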
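
The diagnostic Textbox shows what /predict_sign returns at runtime; gradio_client can also report the endpoint's declared signature without pushing a video through it. A quick one-off check, assuming the same Space and HF_TOKEN environment variable used in the script:

import os
from gradio_client import Client

client = Client("st192011/ASL-VLS-Private", token=os.environ.get("HF_TOKEN"))
# Prints every exposed endpoint with its parameter and return types,
# including the /predict_sign endpoint called above.
client.view_api()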
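
For reference, the new dataset_options comprehension maps each (id, gloss) pair to a display name and a dataset path, e.g. ("00943", "ADAPT") becomes {"ADAPT (Sample 00943)": "data/data_0/00943.mp4"}. A standalone sketch of fetching that one sample the same way update_video_display does:

from huggingface_hub import hf_hub_download

# IDs are zero-padded to five digits before being joined into the path.
path = hf_hub_download(
    repo_id="Voxel51/WLASL",
    filename="data/data_0/00943.mp4",
    repo_type="dataset",
)
print(path)  # local cache location of the downloaded clip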