st192011 committed on
Commit
05e228c
·
verified ·
1 Parent(s): 776ce26

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -21
app.py CHANGED
@@ -21,8 +21,11 @@ SUPPORTED_VIDEOS = [
21
  ("00943", "ADAPT"), ("00414", "ABOUT"), ("00376", "ABLE"), ("00832", "ACROSS"),
22
  ("00627", "ACCIDENT"), ("00592", "ACCEPT"), ("00625", "ACCIDENT"), ("01012", "ADDRESS"),
23
  ("00849", "ACT"), ("00663", "ACCOMPLISH"), ("00853", "ACTION"), ("00967", "ADD"),
24
- ("00692", "ACCOUNTANT"), ("00583", "ACCENT")
 
 
25
  ]
 
26
  dataset_options = {f"{g} (Sample {vid})": f"data/data_0/{vid.zfill(5)}.mp4" for vid, g in SUPPORTED_VIDEOS}
27
 
28
  # 3. INITIALIZE CLIENT
@@ -33,50 +36,74 @@ except Exception as e:
33
 
34
  # 4. LOGIC FUNCTIONS
35
  def update_video_display(selection):
36
- if not selection: return None
37
  try:
 
38
  hf_path = dataset_options[selection]
39
  cache_path = hf_hub_download(repo_id="Voxel51/WLASL", filename=hf_path, repo_type="dataset")
40
  local_path = os.path.join("/tmp", os.path.basename(hf_path))
41
  shutil.copy(cache_path, local_path)
42
- return local_path
43
  except:
44
- return None
45
 
46
  def run_omnisign_vlm(video_path):
47
- if not video_path: return "No Input"
48
- if not client: return "Engine Offline"
49
 
50
  try:
51
- # PURE INQUIRY: Capture the raw response from the Private Space
52
- raw_result = client.predict(
53
  handle_file(video_path),
54
  api_name="/predict_sign"
55
  )
56
 
57
- # RETURN RAW DATA FOR INSPECTION
58
- # This will show the type and the content (e.g., <class 'tuple'> : ({"ADAPT": 0.9},))
59
- return f"TYPE: {type(raw_result)}\n\nCONTENT: {raw_result}"
 
 
 
 
60
 
61
  except Exception as e:
62
- return f"API ERROR: {str(e)}"
63
 
64
- # 5. UI DESIGN
65
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
66
- gr.Markdown("# 🧠 Diagnostic Mode: OmniSign VLM")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  with gr.Row():
69
  with gr.Column():
70
- video_comp = gr.Video(label="Input Buffer", autoplay=True)
71
- dataset_drop = gr.Dropdown(choices=[""] + sorted(list(dataset_options.keys())), label="Select Sample")
72
- run_btn = gr.Button("Analyze Raw Response", variant="primary")
 
 
73
 
74
  with gr.Column():
75
- # CHANGED: Using Textbox instead of Label to see the raw data structure
76
- raw_output_box = gr.Textbox(label="Raw Data Received from Private Space", lines=10)
 
 
 
77
 
78
- dataset_drop.change(fn=update_video_display, inputs=dataset_drop, outputs=video_comp)
79
- run_btn.click(fn=run_omnisign_vlm, inputs=video_comp, outputs=raw_output_box)
80
 
81
  if __name__ == "__main__":
82
  demo.launch(ssr_mode=False)
 
21
  ("00943", "ADAPT"), ("00414", "ABOUT"), ("00376", "ABLE"), ("00832", "ACROSS"),
22
  ("00627", "ACCIDENT"), ("00592", "ACCEPT"), ("00625", "ACCIDENT"), ("01012", "ADDRESS"),
23
  ("00849", "ACT"), ("00663", "ACCOMPLISH"), ("00853", "ACTION"), ("00967", "ADD"),
24
+ ("00692", "ACCOUNTANT"), ("00583", "ACCENT"), ("00341", "ACROSS"), ("00378", "ADDRESS"),
25
+ ("00433", "ADJECTIVE"), ("00384", "ACTOR"), ("00381", "ACTOR"), ("00377", "ACCIDENT"),
26
+ ("00382", "ACTOR"), ("00378", "ADDRESS")
27
  ]
# De-duplicated, alphabetically sorted glosses; joined into the
# "Supported Vocabulary List" panel of the UI.
SUPPORTED_GLOSSES_UNIQUE = sorted({gloss for _, gloss in SUPPORTED_VIDEOS})

# Maps each dropdown caption ("GLOSS (Sample ID)") to the clip's file path
# inside the dataset repo. Repeated (id, gloss) pairs in SUPPORTED_VIDEOS
# collapse into a single entry because the caption is the dict key.
dataset_options = {
    f"{gloss} (Sample {vid})": f"data/data_0/{vid.zfill(5)}.mp4"
    for vid, gloss in SUPPORTED_VIDEOS
}
30
 
31
  # 3. INITIALIZE CLIENT
 
36
 
37
  # 4. LOGIC FUNCTIONS
def update_video_display(selection):
    """Fetch the selected sample clip and report its ground-truth gloss.

    Args:
        selection: Dropdown caption of the form "GLOSS (Sample ID)", i.e. a
            key of ``dataset_options``. Falsy when nothing is selected.

    Returns:
        A ``(video_path, ground_truth_text)`` pair:
        ``(None, None)`` when nothing is selected,
        ``(None, "Error loading sample")`` when the clip cannot be prepared,
        otherwise the local file path and ``"Ground Truth: <GLOSS>"``.
    """
    import logging

    if not selection:
        return None, None

    try:
        hf_path = dataset_options[selection]
        cache_path = hf_hub_download(
            repo_id="Voxel51/WLASL", filename=hf_path, repo_type="dataset"
        )
        # The clip is copied out of the hub cache into /tmp -- presumably so
        # the video component can serve it from a plain path (confirm).
        local_path = os.path.join("/tmp", os.path.basename(hf_path))
        shutil.copy(cache_path, local_path)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit. Keep the best-effort UI message but record the cause.
        logging.getLogger(__name__).exception(
            "Could not load sample for selection %r", selection
        )
        return None, "Error loading sample"

    # Reaching here means ``selection`` was a valid caption, so the gloss is
    # everything before the "(Sample ...)" suffix.
    gloss_gt = selection.split("(")[0].strip()
    return local_path, f"Ground Truth: {gloss_gt}"
49
 
def _label_scores(result):
    """Turn a ``/predict_sign`` payload into a value ``gr.Label`` can show.

    The payload is expected to look like
    ``{'label': 'X', 'confidences': [{'label': 'X', 'confidence': 0.9}]}``.
    A label-only payload (``confidences`` missing, ``None`` or empty --
    assumed possible from the label payload shape; confirm against the
    Space) falls back to the bare label text instead of failing while
    iterating ``None``. Anything unrecognised is passed through unchanged.
    """
    if not isinstance(result, dict):
        return result

    confidences = result.get("confidences")
    if confidences:
        return {item["label"]: item["confidence"] for item in confidences}

    label = result.get("label")
    if label is not None:
        return str(label)
    return result


def run_omnisign_vlm(video_path):
    """Send a clip to the remote ``/predict_sign`` endpoint and return scores.

    Args:
        video_path: Path of the clip held by the video component; falsy when
            no clip is loaded.

    Returns:
        A ``{label: confidence}`` mapping (or a bare label string) for the
        ``gr.Label`` output. Missing input, an unavailable client, and request
        or parsing failures are reported as single-entry mappings with a
        score of ``0.0`` rather than raised.
    """
    import logging

    if not video_path:
        return {"⚠️ No Input": 0.0}
    if not client:
        return {"⚠️ Engine Offline": 0.0}

    try:
        # Request from Private Space
        result = client.predict(
            handle_file(video_path),
            api_name="/predict_sign",
        )
        return _label_scores(result)
    except Exception:
        # Keep the UI responsive, but record the traceback: previously the
        # exception was bound and silently discarded.
        logging.getLogger(__name__).exception(
            "predict_sign request failed for %r", video_path
        )
        return {" Neural Analysis Error": 0.0}
71
 
72
+ # 5. UI DESIGN (PROFESSIONAL PITCH)
# Landing copy shown at the top of the page. Defined flush-left at module
# level so the Markdown source carries no indentation from layout nesting.
# It is a plain string: the text has no placeholders, so no f-prefix.
_INTRO_MARKDOWN = """
# 🧠 OmniSign VLM: Neural Universal SL Protocol
### **Powered by Multimodal Temporal Reasoning**

This demonstration showcases a revolutionary **Structural Protocol** for sign language interpretation, powered by a Large Vision-Language Model (VLM) core. Our protocol focuses on extracting pure **kinetic semantics** from video streams.

**The OmniSign Protocol's Unique Advantages:**

1. **Motion-Oriented Core:** The system is designed to analyze the physics and trajectory of movement, rendering the prediction robust against variations in the signer, lighting, or background environment.
2. **Lexical Agnosticism:** The underlying VLM protocol is language-independent. It can be instantly updated to recognize new signs or expanded to include any sign language (e.g., ASL, BSL, LSF) with unparalleled efficiency.
3. **Future-Proof Scalability:** New vocabulary can be integrated into the system's lexicon instantly, bypassing traditional, time-consuming retraining cycles.

---
*Notice: This is a structural proof-of-concept. The current engine is unoptimized and operates on a limited vocabulary subset to showcase the protocol's power.*
"""

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: sample picker, video preview and the run button.
        with gr.Column():
            gr.Markdown("### 🎦 1. Input Source")
            video_comp = gr.Video(label="Video Buffer", autoplay=True)
            # The leading "" choice gives the dropdown an empty entry;
            # update_video_display returns (None, None) for it.
            dataset_drop = gr.Dropdown(
                choices=[""] + sorted(dataset_options),
                label="Browse WLASL Archive",
            )
            gt_output = gr.Textbox(label="Ground Truth Reference", interactive=False)
            run_btn = gr.Button("🚀 Execute Neural Analysis", variant="primary")

        # Right column: prediction scores and the supported vocabulary.
        with gr.Column():
            gr.Markdown("### 📊 2. VLM Perception Result")
            output_label = gr.Label(num_top_classes=3, label="Confidence Score")

            with gr.Accordion("🔍 Supported Vocabulary List", open=True):
                gr.Markdown(", ".join(SUPPORTED_GLOSSES_UNIQUE))

    # Picking a sample fills both the video preview and the ground-truth box.
    dataset_drop.change(
        fn=update_video_display,
        inputs=dataset_drop,
        outputs=[video_comp, gt_output],
    )
    # The button sends whatever clip is currently in the video component.
    run_btn.click(fn=run_omnisign_vlm, inputs=video_comp, outputs=output_label)

if __name__ == "__main__":
    demo.launch(ssr_mode=False)