Spaces:

RohitMugalya
/

zero-shot-video-classifier

Sleeping

zero-shot-video-classifier / src /app.py

Rohit Mugalya

updated the interface with all three models

91ea5a9 25 days ago

5.54 kB

	import warnings
	warnings.filterwarnings("ignore")

	import sys
	from pathlib import Path
	sys.path.append(str(Path(__file__).parent))

	import streamlit as st
	import tempfile
	import os
	import time

	from classifier import ZeroShotVideoClassifier, MODELS
	from frame_extractor import extract_frames

	# Page-level settings: browser tab title/icon, wide layout, sidebar open on load.
	st.set_page_config(
	    layout="wide",
	    initial_sidebar_state="expanded",
	    page_title="Zero-Shot Video Classifier",
	    page_icon="🎬",
	)


	@st.cache_resource(show_spinner=False)
	def load_classifier(model_key: str) -> ZeroShotVideoClassifier:
	    """Return a ZeroShotVideoClassifier built for ``model_key``.

	    Wrapped in ``st.cache_resource`` (with Streamlit's own spinner turned
	    off; the caller shows its own) so the classifier object is cached per
	    ``model_key`` instead of being rebuilt on every call.

	    Args:
	        model_key: Model identifier forwarded to ``ZeroShotVideoClassifier``;
	            the sidebar offers the keys of ``MODELS``.

	    Returns:
	        The classifier instance for ``model_key``.
	    """
	    classifier = ZeroShotVideoClassifier(model_key=model_key)
	    return classifier


	# Sidebar: branding, model picker, video upload and frame-count control.
	# Defines model_key, uploaded and num_frames for the rest of the script.
	with st.sidebar:
	    st.title("🎬 VideoClassify")
	    st.caption("Zero-shot video understanding via vision-language models")
	    st.divider()

	    model_key = st.selectbox("Model", options=list(MODELS), index=0)

	    # One-line summary shown under the picker, keyed by model name.
	    model_info = {
	        "CLIP ViT-B/32": "338MB · UCF-101 Top-1: 58.22% · Pure zero-shot",
	        "SigLIP 2 Base": "350MB · UCF-101 Top-1: 70.79% · Pure zero-shot",
	        "X-CLIP Base": "780MB · UCF-101 Top-1: 72.44% · Kinetics pretrained",
	    }
	    # MODELS comes from the classifier module, so its keys can drift from
	    # this hard-coded dict; skip the caption instead of raising KeyError.
	    model_blurb = model_info.get(model_key)
	    if model_blurb:
	        st.caption(model_blurb)

	    st.divider()

	    uploaded = st.file_uploader("Upload Video", type=["mp4", "avi", "mov", "mkv"])
	    num_frames = st.slider("Frames to sample", min_value=4, max_value=16, value=8)


	# Main page header, followed by the two-column split used below:
	# the left column is slightly wider (1.1 : 0.9) than the right one.
	st.title("Zero-Shot Video Classifier")
	st.caption("Classify any video using natural language — no task-specific training required")
	st.divider()

	column_weights = [1.1, 0.9]
	left_col, right_col = st.columns(column_weights, gap="large")

	# Left column: label entry, run button, inference and result rendering.
	# NOTE(review): the original nesting was lost in this copy of the file;
	# the results section is assumed to render inside left_col — confirm.
	with left_col:
	    st.subheader("Classification")

	    labels_input = st.text_area(
	        "Labels (one per line)",
	        value="playing basketball\nswimming\ncooking food\nriding a bike\ndoing archery\nplaying guitar\nweightlifting\ndancing",
	        height=200,
	    )

	    # The button is always rendered; it stays disabled until a video exists.
	    no_video = not uploaded
	    if no_video:
	        st.info("Upload a video in the sidebar to get started.")
	        st.markdown("")
	    run = st.button(
	        "▶ Run Classification",
	        use_container_width=True,
	        type="primary",
	        disabled=no_video,
	    )

	    if uploaded and run:
	        # One candidate label per non-blank line.
	        labels = [line.strip() for line in labels_input.splitlines() if line.strip()]

	        if len(labels) < 2:
	            st.error("Enter at least 2 labels.")
	        else:
	            # extract_frames takes a filesystem path, so spill the upload to
	            # a temp file that keeps the original extension. getvalue()
	            # returns the full buffer regardless of the read position.
	            with tempfile.NamedTemporaryFile(
	                delete=False, suffix=Path(uploaded.name).suffix
	            ) as tmp:
	                tmp.write(uploaded.getvalue())
	                tmp_path = tmp.name

	            frames = []
	            predictions = []
	            elapsed = 0.0
	            try:
	                with st.spinner(f"Loading {model_key}..."):
	                    classifier = load_classifier(model_key)

	                with st.spinner("Extracting frames..."):
	                    frames = extract_frames(tmp_path, num_frames=num_frames)

	                if frames:
	                    with st.spinner("Classifying..."):
	                        # perf_counter is the monotonic clock meant for intervals.
	                        t0 = time.perf_counter()
	                        predictions = classifier.classify(frames, labels, top_k=len(labels))
	                        elapsed = time.perf_counter() - t0
	            finally:
	                # delete=False means nothing else removes the file; clean up
	                # on every path, including unreadable videos and exceptions.
	                os.unlink(tmp_path)

	            if not frames:
	                st.error("Could not read frames from this video.")
	            else:
	                st.divider()

	                # Headline numbers come from the first prediction.
	                best = predictions[0]
	                m1, m2, m3 = st.columns(3)
	                m1.metric("Top Prediction", best["label"].title())
	                m2.metric("Confidence", f"{best['score']*100:.1f}%")
	                m3.metric("Inference Time", f"{elapsed:.1f}s")

	                st.divider()
	                st.subheader("Predictions")

	                for i, r in enumerate(predictions):
	                    rank = "🥇" if i == 0 else f"#{i+1}"
	                    # Bar length is twice the score, capped at a full bar.
	                    # Coerce to a plain float inside [0.0, 1.0], the range
	                    # st.progress accepts for float values.
	                    bar_fill = float(min(max(r["score"] * 2, 0.0), 1.0))
	                    st.progress(
	                        bar_fill,
	                        text=f"{rank} {r['label'].capitalize()} — {r['score']*100:.2f}%",
	                    )

	                st.divider()
	                st.subheader("Sampled Frames")
	                # One equal-width column per sampled frame.
	                cols = st.columns(len(frames))
	                for col, frame in zip(cols, frames):
	                    with col:
	                        st.image(frame, use_container_width=True)

	# Right column: static UCF-101 benchmark table plus per-model notes.
	with right_col:
	    st.subheader("Benchmark — UCF-101")
	    st.caption("10 videos per class · 101 classes · 1010 videos total")

	    # Column-oriented table; every list has one entry per model.
	    benchmark_table = {
	        "Model": ["CLIP ViT-B/32", "SigLIP 2 Base", "X-CLIP Base"],
	        "Size": ["338MB", "350MB", "780MB"],
	        "Top-1": ["58.22%", "70.79%", "72.44%"],
	        "Top-5": ["85.35%", "93.27%", "91.24%"],
	        "Type": ["Pure zero-shot", "Pure zero-shot", "Kinetics pretrained"],
	    }
	    st.dataframe(data=benchmark_table, hide_index=True, use_container_width=True)

	    st.divider()
	    st.subheader("Model Notes")

	    # (expander title, note text, optional caveat rendered as a warning)
	    # NOTE(review): the original nesting was lost in this copy of the file;
	    # the warning is assumed to sit inside the X-CLIP expander — confirm.
	    model_notes = (
	        (
	            "CLIP ViT-B/32",
	            "Contrastive softmax loss. Frames encoded independently then averaged into a single video embedding. Genuine zero-shot — no video-specific pretraining.",
	            None,
	        ),
	        (
	            "SigLIP 2 Base",
	            "Sigmoid loss objective instead of softmax. Scores per label are independent, making confidence values more calibrated. Best efficiency-to-accuracy ratio of the three.",
	            None,
	        ),
	        (
	            "X-CLIP Base",
	            "Video-native architecture with cross-frame attention — frames attend to each other before producing the final embedding. Pretrained on Kinetics-400 which has class overlap with UCF-101, so its score is not a clean zero-shot result.",
	            "Kinetics-400 / UCF-101 class overlap — results may be inflated.",
	        ),
	    )
	    for note_title, note_text, caveat in model_notes:
	        with st.expander(note_title, expanded=True):
	            st.write(note_text)
	            if caveat is not None:
	                st.warning(caveat, icon="⚠️")