st192011 committed on
Commit
776ce26
Β·
verified Β·
1 Parent(s): 6d33610

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -85
app.py CHANGED
@@ -2,11 +2,10 @@ import os
2
  import shutil
3
  import gradio as gr
4
  from gradio_client import Client, handle_file
5
- from huggingface_hub import hf_hub_download, list_repo_files
6
 
7
  # 1. SECRETS & BACKEND LINK
8
  HF_TOKEN = os.environ.get("HF_TOKEN")
9
- # Make sure this matches your private space URL exactly
10
  PRIVATE_SPACE = "st192011/ASL-VLS-Private"
11
 
12
  # 2. DEFINITIVE SUPPORTED VOCABULARY LIST
@@ -22,120 +21,62 @@ SUPPORTED_VIDEOS = [
22
  ("00943", "ADAPT"), ("00414", "ABOUT"), ("00376", "ABLE"), ("00832", "ACROSS"),
23
  ("00627", "ACCIDENT"), ("00592", "ACCEPT"), ("00625", "ACCIDENT"), ("01012", "ADDRESS"),
24
  ("00849", "ACT"), ("00663", "ACCOMPLISH"), ("00853", "ACTION"), ("00967", "ADD"),
25
- ("00692", "ACCOUNTANT"), ("00583", "ACCENT"), ("00341", "ACROSS"), ("00378", "ADDRESS"),
26
- ("00433", "ADJECTIVE"), ("00384", "ACTOR"), ("00381", "ACTOR"), ("00377", "ACCIDENT"),
27
- ("00382", "ACTOR"), ("00378", "ADDRESS")
28
  ]
29
- SUPPORTED_GLOSSES_UNIQUE = sorted(list(set([g for _, g in SUPPORTED_VIDEOS])))
30
 
31
- # 3. DATASET DISCOVERY AND MAPPING
32
- print("Dataset Discovery: Mapping specific video IDs to Glosses...")
33
- dataset_options = {}
34
- for vid_id, gloss in SUPPORTED_VIDEOS:
35
- # Construct the full HF path (assuming 5-digit ID)
36
- hf_path = f"data/data_0/{vid_id.zfill(5)}.mp4"
37
- display_name = f"{gloss} (Sample {vid_id})"
38
- dataset_options[display_name] = hf_path
39
-
40
- # 4. INITIALIZE CLIENT
41
- print(f"πŸ”Œ Attempting connection to {PRIVATE_SPACE}...")
42
  try:
43
- # Use 'token=' (standard) instead of 'hf_token='
44
  client = Client(PRIVATE_SPACE, token=HF_TOKEN)
45
- print("βœ… Neural Engine Online!")
46
  except Exception as e:
47
- print(f"❌ Connection Failed: {e}")
48
  client = None
49
 
50
- # 5. LOGIC FUNCTIONS
51
  def update_video_display(selection):
52
- if not selection: return None, None
53
  try:
54
- gloss_gt = selection.split('(')[0].strip()
55
-
56
- # Download the video file to /tmp for local playback
57
  hf_path = dataset_options[selection]
58
  cache_path = hf_hub_download(repo_id="Voxel51/WLASL", filename=hf_path, repo_type="dataset")
59
  local_path = os.path.join("/tmp", os.path.basename(hf_path))
60
  shutil.copy(cache_path, local_path)
61
-
62
- return local_path, f"Ground Truth: {gloss_gt}"
63
- except Exception as e:
64
- return None, f"Error downloading sample: {e}"
65
 
66
def run_omnisign_vlm(video_path):
    """Send a video to the private Space and normalize its prediction output.

    Args:
        video_path: Local path of the recorded/uploaded/selected clip.

    Returns:
        A {label: confidence} dict suitable for gr.Label; sentinel dicts
        ({"⚠️ ...": 0.0} / {"❌ Error...": 0.0}) signal missing input,
        an offline backend, or an API failure.
    """
    if not video_path: return {"⚠️ No Input": 0.0}
    if not client: return {"⚠️ Engine Offline": 0.0}

    try:
        # 1. Get raw result from Private Space
        result = client.predict(
            handle_file(video_path),
            api_name="/predict_sign"
        )

        # 2. DEBUG: Check if result is the raw Label structure
        # If the result has 'confidences', we must extract them.
        if isinstance(result, dict) and "confidences" in result:
            # Transform complex list -> simple dict for the UI
            # From: {'confidences': [{'label': 'A', 'confidence': 0.9}, ...]}
            # To: {'A': 0.9, ...}
            clean_output = {
                item['label']: item['confidence']
                for item in result["confidences"]
            }
            return clean_output

        # 3. Fallback: If it's already a simple dict, return it
        return result

    except Exception as e:
        # Map any client/transport error to a displayable sentinel entry.
        return {f"❌ Error: {str(e)}": 0.0}
94
 
95
- # 6. UI DESIGN
96
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
97
- gr.Markdown(f"""
98
- # 🧠 OmniSign VLM: Neural Universal SL Protocol
99
- ### **Powered by Multimodal Temporal Reasoning**
100
-
101
- This demonstration showcases a revolutionary **Structural Protocol** for sign language interpretation, powered by a Large Vision-Language Model (VLM) core. Our protocol focuses on extracting pure **kinetic semantics** from video streams.
102
-
103
- **The OmniSign Protocol's Unique Advantages:**
104
-
105
- 1. **Motion-Oriented Core:** The system is designed to analyze the physics and trajectory of movement, rendering the prediction robust against variations in the signer, lighting, or background environment.
106
- 2. **Lexical Agnosticism:** The underlying VLM protocol is language-independent. It can be instantly updated to recognize new signs or expanded to include any sign language (e.g., ASL, BSL, LSF) with unparalleled efficiency.
107
- 3. **Future-Proof Scalability:** New vocabulary can be integrated into the system's lexicon instantly, bypassing traditional, time-consuming retraining cycles.
108
-
109
- ---
110
- *Notice: This is a structural proof-of-concept. The current engine is unoptimized and operates on a limited vocabulary subset to showcase the protocol's power.*
111
- """)
112
 
113
  with gr.Row():
114
  with gr.Column():
115
- gr.Markdown("### 🎦 1. Select Input")
116
- video_comp = gr.Video(label="Video Buffer: Record or Upload", autoplay=True)
117
-
118
- dataset_drop = gr.Dropdown(
119
- choices=[""] + sorted(list(dataset_options.keys())),
120
- label="Explore WLASL Samples (Verified Support)",
121
- value=""
122
- )
123
-
124
- gt_output = gr.Textbox(label="Ground Truth", interactive=False, value="Select a sample above to view its Ground Truth.")
125
-
126
- run_btn = gr.Button("πŸš€ Execute Neural Analysis", variant="primary")
127
 
128
  with gr.Column():
129
- gr.Markdown("### πŸ“Š 2. VLM Perception Output")
130
- output_label = gr.Label(num_top_classes=3, label="VLM Confidence Output")
131
-
132
- with gr.Accordion("πŸ” View Supported Vocabulary List", open=True):
133
- gr.Markdown(f"**This demo subset recognizes {len(SUPPORTED_GLOSSES_UNIQUE)} unique words:**")
134
- gr.Markdown(", ".join(SUPPORTED_GLOSSES_UNIQUE))
135
 
136
- # Event Mapping
137
- dataset_drop.change(fn=update_video_display, inputs=dataset_drop, outputs=[video_comp, gt_output])
138
- run_btn.click(fn=run_omnisign_vlm, inputs=video_comp, outputs=output_label)
139
 
140
  if __name__ == "__main__":
141
  demo.launch(ssr_mode=False)
 
2
  import shutil
3
  import gradio as gr
4
  from gradio_client import Client, handle_file
5
+ from huggingface_hub import hf_hub_download
6
 
7
  # 1. SECRETS & BACKEND LINK
8
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
9
  PRIVATE_SPACE = "st192011/ASL-VLS-Private"
10
 
11
  # 2. DEFINITIVE SUPPORTED VOCABULARY LIST
 
21
  ("00943", "ADAPT"), ("00414", "ABOUT"), ("00376", "ABLE"), ("00832", "ACROSS"),
22
  ("00627", "ACCIDENT"), ("00592", "ACCEPT"), ("00625", "ACCIDENT"), ("01012", "ADDRESS"),
23
  ("00849", "ACT"), ("00663", "ACCOMPLISH"), ("00853", "ACTION"), ("00967", "ADD"),
24
+ ("00692", "ACCOUNTANT"), ("00583", "ACCENT")
 
 
25
  ]
26
# Map human-readable dropdown labels ("GLOSS (Sample 00123)") to the file
# paths inside the Voxel51/WLASL dataset repo; IDs are zero-padded to 5 digits.
dataset_options = {}
for _vid_id, _gloss in SUPPORTED_VIDEOS:
    _label = f"{_gloss} (Sample {_vid_id})"
    dataset_options[_label] = f"data/data_0/{_vid_id.zfill(5)}.mp4"
27
 
28
# 3. INITIALIZE CLIENT
# Connect to the private inference Space. Fall back to None on failure so
# the UI can still launch; run_omnisign_vlm reports "Engine Offline" then.
try:
    client = Client(PRIVATE_SPACE, token=HF_TOKEN)
except Exception as e:
    # Previously the caught exception was silently discarded, making a bad
    # HF_TOKEN or wrong Space name undiagnosable. Log the reason.
    print(f"Connection to {PRIVATE_SPACE} failed: {e}")
    client = None
33
 
34
+ # 4. LOGIC FUNCTIONS
35
def update_video_display(selection):
    """Resolve a dropdown selection to a locally playable sample video.

    Args:
        selection: Display label chosen in the dropdown, e.g.
            "ABOUT (Sample 00414)"; "" or None when the dropdown is cleared.

    Returns:
        Path to a local copy of the sample clip, or None when nothing is
        selected or the sample cannot be fetched (clears the gr.Video).
    """
    if not selection:
        return None
    try:
        hf_path = dataset_options[selection]
        cache_path = hf_hub_download(repo_id="Voxel51/WLASL", filename=hf_path, repo_type="dataset")
        # Copy out of the read-only HF cache so the player serves a plain /tmp file.
        local_path = os.path.join("/tmp", os.path.basename(hf_path))
        shutil.copy(cache_path, local_path)
        return local_path
    except Exception as e:
        # Was a bare `except:` that also trapped SystemExit/KeyboardInterrupt
        # and hid the failure reason entirely; narrow the catch and log it.
        print(f"Sample download failed for {selection!r}: {e}")
        return None
 
45
 
46
def run_omnisign_vlm(video_path):
    """Forward the buffered clip to the private Space and echo the raw reply.

    Diagnostic helper: instead of parsing the prediction, it reports the
    Python type and content of whatever the backend returned, so the payload
    shape can be inspected directly in the UI.

    Args:
        video_path: Local path of the clip to analyze (falsy when absent).

    Returns:
        A human-readable string: the raw response dump, or a status/error
        message ("No Input", "Engine Offline", "API ERROR: ...").
    """
    if not video_path:
        return "No Input"
    if not client:
        return "Engine Offline"

    try:
        # PURE INQUIRY: Capture the raw response from the Private Space
        payload = handle_file(video_path)
        raw_result = client.predict(payload, api_name="/predict_sign")
        # RETURN RAW DATA FOR INSPECTION
        # e.g. <class 'tuple'> : ({"ADAPT": 0.9},)
        return f"TYPE: {type(raw_result)}\n\nCONTENT: {raw_result}"
    except Exception as e:
        return f"API ERROR: {str(e)}"
63
 
64
# 5. UI DESIGN
# Two-column diagnostic layout: input controls (video buffer + sample picker)
# on the left, the raw backend response on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Diagnostic Mode: OmniSign VLM")

    with gr.Row():
        with gr.Column():
            video_comp = gr.Video(label="Input Buffer", autoplay=True)
            # "" as first choice lets the user clear the selection.
            dataset_drop = gr.Dropdown(choices=[""] + sorted(list(dataset_options.keys())), label="Select Sample")
            run_btn = gr.Button("Analyze Raw Response", variant="primary")

        with gr.Column():
            # CHANGED: Using Textbox instead of Label to see the raw data structure
            raw_output_box = gr.Textbox(label="Raw Data Received from Private Space", lines=10)

    # Event wiring: picking a sample downloads it into the video buffer;
    # the button sends the buffered clip to the backend for raw inspection.
    dataset_drop.change(fn=update_video_display, inputs=dataset_drop, outputs=video_comp)
    run_btn.click(fn=run_omnisign_vlm, inputs=video_comp, outputs=raw_output_box)
 
80
 
81
  if __name__ == "__main__":
82
  demo.launch(ssr_mode=False)