# NOTE: removed Hugging Face file-viewer chrome that was pasted in with the
# source ("st192011's picture / Update app.py / 01ab6f8 verified") — it is
# web-page residue, not code, and would break the Python parser.
import os
import shutil
import gradio as gr
from gradio_client import Client, handle_file
from huggingface_hub import hf_hub_download
# 1. SECRETS & BACKEND LINK
HF_TOKEN = os.environ.get("HF_TOKEN")  # token for the private backend Space; None when unset
PRIVATE_SPACE = "st192011/ASL-VLS-Private"  # Space exposing the /predict_sign endpoint

# 2. DEFINITIVE SUPPORTED VOCABULARY LIST
# (WLASL video id, gloss) pairs; one gloss may have several sample clips.
# NOTE: the original list contained ("00378", "ADDRESS") twice; the duplicate
# was dropped — it produced the identical dropdown label and clip path.
SUPPORTED_VIDEOS = [
    ("00944", "ADAPT"), ("00963", "ADD"), ("01064", "ADJECTIVE"), ("00335", "ABDOMEN"),
    ("00689", "ACCOUNTANT"), ("00899", "ACTOR"), ("00584", "ACCENT"), ("00632", "ACCIDENT"),
    ("00586", "ACCENT"), ("00585", "ACCENT"), ("00626", "ACCIDENT"), ("00623", "ACCIDENT"),
    ("00846", "ACT"), ("00890", "ACTIVITY"), ("00898", "ACTOR"), ("01011", "ADDRESS"),
    ("00834", "ACROSS"), ("00624", "ACCIDENT"), ("00593", "ACCEPT"), ("00415", "ABOUT"),
    ("00961", "ADD"), ("00962", "ADD"), ("00594", "ACCEPT"), ("00964", "ADD"),
    ("00666", "ACCOMPLISH"), ("01065", "ADJECTIVE"), ("00628", "ACCIDENT"), ("00868", "ACTIVE"),
    ("00836", "ACROSS"), ("00430", "ABOVE"), ("00835", "ACROSS"), ("00946", "ADAPT"),
    ("00943", "ADAPT"), ("00414", "ABOUT"), ("00376", "ABLE"), ("00832", "ACROSS"),
    ("00627", "ACCIDENT"), ("00592", "ACCEPT"), ("00625", "ACCIDENT"), ("01012", "ADDRESS"),
    ("00849", "ACT"), ("00663", "ACCOMPLISH"), ("00853", "ACTION"), ("00967", "ADD"),
    ("00692", "ACCOUNTANT"), ("00583", "ACCENT"), ("00341", "ACROSS"), ("00378", "ADDRESS"),
    ("00433", "ADJECTIVE"), ("00384", "ACTOR"), ("00381", "ACTOR"), ("00377", "ACCIDENT"),
    ("00382", "ACTOR"),
]

# Alphabetised unique gloss names, shown in the "Supported Vocabulary" accordion.
# Set comprehension replaces the redundant sorted(list(set([...]))) chain.
SUPPORTED_GLOSSES_UNIQUE = sorted({g for _, g in SUPPORTED_VIDEOS})

# Dropdown label -> clip path inside the Voxel51/WLASL dataset repo.
# zfill(5) is purely defensive: the ids above are already 5 digits wide.
dataset_options = {f"{g} (Sample {vid})": f"data/data_0/{vid.zfill(5)}.mp4" for vid, g in SUPPORTED_VIDEOS}
# 3. INITIALIZE CLIENT
# Connect to the private inference Space once at startup.  On failure we fall
# back to client = None so the UI still loads and the predict handler can
# report "Engine Offline" instead of the whole app crashing — but we now log
# the cause instead of silently discarding it (the bound `e` was unused).
try:
    client = Client(PRIVATE_SPACE, token=HF_TOKEN)
except Exception as e:
    print(f"WARNING: could not connect to backend Space {PRIVATE_SPACE!r}: {e}")
    client = None
# 4. LOGIC FUNCTIONS
def update_video_display(selection):
    """Load the WLASL sample clip matching a dropdown selection.

    Args:
        selection: Dropdown label like "ADD (Sample 00961)", or ""/None when
            the user picked the blank entry.

    Returns:
        Tuple of (local_video_path, status_text):
        (None, None) for an empty selection, and
        (None, "Error loading sample") on any lookup/download/copy failure.
    """
    if not selection:
        return None, None
    try:
        # Label format is "<GLOSS> (Sample <id>)" — keep only the gloss part.
        gloss_gt = selection.split('(')[0].strip()
        hf_path = dataset_options[selection]
        # Fetch (or reuse from the local HF cache) the clip from the dataset repo.
        cache_path = hf_hub_download(repo_id="Voxel51/WLASL", filename=hf_path, repo_type="dataset")
        # Copy out of the read-only cache so Gradio can serve the file from /tmp.
        local_path = os.path.join("/tmp", os.path.basename(hf_path))
        shutil.copy(cache_path, local_path)
        return local_path, f"Ground Truth: {gloss_gt}"
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any real failure degrades to a UI message.
        return None, "Error loading sample"
def run_omnisign_vlm(video_path):
    """Send a clip to the private Space and normalise its prediction.

    Args:
        video_path: Local path to the video, or ""/None when no clip is loaded.

    Returns:
        dict mapping label -> confidence, consumable by gr.Label.  Sentinel
        single-key dicts signal missing input, an offline backend, or a
        backend/parsing failure.
    """
    if not video_path:
        return {"⚠️ No Input": 0.0}
    if not client:
        return {"⚠️ Engine Offline": 0.0}
    try:
        # Request from Private Space
        result = client.predict(
            handle_file(video_path),
            api_name="/predict_sign"
        )
        # --- PARSING LOGIC ---
        # A gr.Label payload arrives as {"label": ..., "confidences":
        # [{"label": ..., "confidence": ...}, ...]}; flatten it to a dict.
        if isinstance(result, dict) and "confidences" in result:
            return {item['label']: item['confidence'] for item in result["confidences"]}
        return result
    except Exception:
        # Dropped the unused `as e` binding and the pointless f-prefix on this
        # placeholder-free literal; keep the UI alive on backend failures.
        return {"❌ Neural Analysis Error": 0.0}
# 5. UI DESIGN (PROFESSIONAL PITCH)
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Plain string: the original was an f-string with no placeholders (F541).
    gr.Markdown("""
# 🧠 OmniSign VLM: Neural Universal SL Protocol
### **Powered by Multimodal Temporal Reasoning**
This demonstration showcases a revolutionary **Structural Protocol** for sign language interpretation, powered by a Large Vision-Language Model (VLM) core. Our protocol focuses on extracting pure **kinetic semantics** from video streams.
**The OmniSign Protocol's Unique Advantages:**
1. **Motion-Oriented Core:** The system is designed to analyze the physics and trajectory of movement, rendering the prediction robust against variations in the signer, lighting, or background environment.
2. **Lexical Agnosticism:** The underlying VLM protocol is language-independent. It can be instantly updated to recognize new signs or expanded to include any sign language (e.g., ASL, BSL, LSF) with unparalleled efficiency.
3. **Future-Proof Scalability:** New vocabulary can be integrated into the system's lexicon instantly, bypassing traditional, time-consuming retraining cycles.
---
This engine is currently not yet **fully optimized for predictive accuracy** and operates on a limited vocabulary. Its sole purpose is to demonstrate a **highly versatile and scalable VLM protocol**.
""")
    with gr.Row():
        with gr.Column():
            # Left column: input source — sample browser plus ground-truth echo.
            gr.Markdown("### 🎦 1. Input Source")
            video_comp = gr.Video(label="Video Buffer", autoplay=True)
            dataset_drop = gr.Dropdown(choices=[""] + sorted(list(dataset_options.keys())), label="Browse WLASL Archive")
            gt_output = gr.Textbox(label="Ground Truth Reference", interactive=False)
            # Mojibake repaired in the three labels below: the originals held
            # UTF-8 emoji bytes mis-decoded as cp1252 ("πŸš€" -> 🚀, etc.).
            run_btn = gr.Button("🚀 Execute Neural Analysis", variant="primary")
        with gr.Column():
            # Right column: model output as a top-3 confidence label.
            gr.Markdown("### 📊 2. VLM Perception Result")
            output_label = gr.Label(num_top_classes=3, label="Confidence Score")
    with gr.Accordion("🔍 Supported Vocabulary List", open=True):
        gr.Markdown(", ".join(SUPPORTED_GLOSSES_UNIQUE))
    # Wiring: picking a sample loads the clip + ground truth; the button
    # sends the current clip to the backend and fills the label component.
    dataset_drop.change(fn=update_video_display, inputs=dataset_drop, outputs=[video_comp, gt_output])
    run_btn.click(fn=run_omnisign_vlm, inputs=video_comp, outputs=output_label)
# Script entry point: launch the Gradio server when run directly.
if __name__ == "__main__":
    # ssr_mode=False disables Gradio's server-side rendering — presumably to
    # avoid SSR issues on Spaces; confirm before changing.
    demo.launch(ssr_mode=False)