# Rohit Mugalya
# updated the interface with all three models
# 91ea5a9
import warnings
warnings.filterwarnings("ignore")
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent))
import streamlit as st
import tempfile
import os
import time
from classifier import ZeroShotVideoClassifier, MODELS
from frame_extractor import extract_frames
# Page-level settings: browser-tab title and icon, wide layout, and the
# sidebar opened by default.
_page_settings = {
    "page_title": "Zero-Shot Video Classifier",
    "page_icon": "🎬",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)
@st.cache_resource(show_spinner=False)
def load_classifier(model_key: str) -> ZeroShotVideoClassifier:
    """Construct the classifier for the given model key.

    The function is wrapped in ``st.cache_resource`` with Streamlit's built-in
    spinner disabled, so callers are expected to show their own progress
    indicator around the call.

    Args:
        model_key: Identifier forwarded unchanged as ``model_key`` to
            ``ZeroShotVideoClassifier``.

    Returns:
        The ``ZeroShotVideoClassifier`` instance built for ``model_key``.
    """
    classifier = ZeroShotVideoClassifier(model_key=model_key)
    return classifier
# Sidebar: model selection, video upload and frame-sampling controls.
# Defines ``model_key``, ``uploaded`` and ``num_frames`` for the rest of the script.
with st.sidebar:
    st.title("🎬 VideoClassify")
    st.caption("Zero-shot video understanding via vision-language models")
    st.divider()

    model_key = st.selectbox("Model", options=list(MODELS), index=0)

    # Hand-maintained one-line summaries, keyed by the display names that are
    # expected to match the keys of MODELS.
    model_info = {
        "CLIP ViT-B/32": "338MB · UCF-101 Top-1: 58.22% · Pure zero-shot",
        "SigLIP 2 Base": "350MB · UCF-101 Top-1: 70.79% · Pure zero-shot",
        "X-CLIP Base": "780MB · UCF-101 Top-1: 72.44% · Kinetics pretrained",
    }
    # The selectbox options come from MODELS, not from model_info, so a model
    # without a summary (or no selection at all) must not crash the page with
    # a KeyError; the caption is simply omitted in that case.
    model_summary = model_info.get(model_key)
    if model_summary:
        st.caption(model_summary)
    st.divider()

    uploaded = st.file_uploader("Upload Video", type=["mp4", "avi", "mov", "mkv"])
    num_frames = st.slider("Frames to sample", min_value=4, max_value=16, value=8)
# Page header and the left-hand "Classification" column: label entry, the run
# button, and (after a run) the metrics, per-label scores and sampled frames.
# NOTE(review): the original indentation was lost; the results section is
# assumed to render inside the left column — confirm against the intended layout.
st.title("Zero-Shot Video Classifier")
st.caption("Classify any video using natural language — no task-specific training required")
st.divider()

left_col, right_col = st.columns([1.1, 0.9], gap="large")

with left_col:
    st.subheader("Classification")
    labels_input = st.text_area(
        "Labels (one per line)",
        value="playing basketball\nswimming\ncooking food\nriding a bike\ndoing archery\nplaying guitar\nweightlifting\ndancing",
        height=200,
    )

    if not uploaded:
        st.info("Upload a video in the sidebar to get started.")
        st.markdown("")

    # A single button that stays disabled until a video has been uploaded.
    run = st.button(
        "▶ Run Classification",
        use_container_width=True,
        type="primary",
        disabled=not uploaded,
    )

    if uploaded and run:
        # One candidate label per non-blank line.
        labels = [line.strip() for line in labels_input.splitlines() if line.strip()]

        if len(labels) < 2:
            st.error("Enter at least 2 labels.")
        else:
            # extract_frames takes a file path, so the upload is spooled to a
            # temporary file that keeps the original extension.
            # getvalue() returns the full buffer regardless of the current
            # read position of the uploaded file object.
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=Path(uploaded.name).suffix
            ) as tmp:
                tmp.write(uploaded.getvalue())
                tmp_path = tmp.name

            predictions = None
            elapsed = 0.0
            try:
                with st.spinner(f"Loading {model_key}..."):
                    classifier = load_classifier(model_key)

                with st.spinner("Extracting frames..."):
                    frames = extract_frames(tmp_path, num_frames=num_frames)

                if frames:
                    with st.spinner("Classifying..."):
                        t0 = time.perf_counter()
                        predictions = classifier.classify(frames, labels, top_k=len(labels))
                        elapsed = time.perf_counter() - t0
            finally:
                # delete=False means nothing else removes the file; clean it up
                # on every path, including failures and unreadable videos.
                os.unlink(tmp_path)

            if not frames:
                st.error("Could not read frames from this video.")
            elif not predictions:
                st.error("The model returned no predictions.")
            else:
                best = predictions[0]

                st.divider()
                m1, m2, m3 = st.columns(3)
                m1.metric("Top Prediction", best["label"].title())
                m2.metric("Confidence", f"{best['score'] * 100:.1f}%")
                m3.metric("Inference Time", f"{elapsed:.1f}s")

                st.divider()
                st.subheader("Predictions")
                for i, r in enumerate(predictions):
                    rank = "🥇" if i == 0 else f"#{i + 1}"
                    score = float(r["score"])
                    # Bar length is twice the score for visibility, clamped to
                    # the [0.0, 1.0] range that st.progress accepts for floats.
                    bar = min(max(score * 2, 0.0), 1.0)
                    st.progress(
                        bar,
                        text=f"{rank} {r['label'].capitalize()} — {score * 100:.2f}%",
                    )

                st.divider()
                st.subheader("Sampled Frames")
                cols = st.columns(len(frames))
                for col, frame in zip(cols, frames):
                    with col:
                        st.image(frame, use_container_width=True)
# Right-hand column: static UCF-101 benchmark table and per-model notes.
with right_col:
    st.subheader("Benchmark — UCF-101")
    st.caption("10 videos per class · 101 classes · 1010 videos total")

    benchmark_rows = {
        "Model": ["CLIP ViT-B/32", "SigLIP 2 Base", "X-CLIP Base"],
        "Size": ["338MB", "350MB", "780MB"],
        "Top-1": ["58.22%", "70.79%", "72.44%"],
        "Top-5": ["85.35%", "93.27%", "91.24%"],
        "Type": ["Pure zero-shot", "Pure zero-shot", "Kinetics pretrained"],
    }
    st.dataframe(data=benchmark_rows, use_container_width=True, hide_index=True)

    st.divider()
    st.subheader("Model Notes")

    # (expander title, description, optional caveat shown as a warning).
    # NOTE(review): the original indentation was lost; the warning is assumed
    # to sit inside the X-CLIP expander — confirm against the intended layout.
    model_notes = (
        (
            "CLIP ViT-B/32",
            "Contrastive softmax loss. Frames encoded independently then averaged into a single video embedding. Genuine zero-shot — no video-specific pretraining.",
            None,
        ),
        (
            "SigLIP 2 Base",
            "Sigmoid loss objective instead of softmax. Scores per label are independent, making confidence values more calibrated. Best efficiency-to-accuracy ratio of the three.",
            None,
        ),
        (
            "X-CLIP Base",
            "Video-native architecture with cross-frame attention — frames attend to each other before producing the final embedding. Pretrained on Kinetics-400 which has class overlap with UCF-101, so its score is not a clean zero-shot result.",
            "Kinetics-400 / UCF-101 class overlap — results may be inflated.",
        ),
    )
    for note_title, note_body, note_caveat in model_notes:
        with st.expander(note_title, expanded=True):
            st.write(note_body)
            if note_caveat is not None:
                st.warning(note_caveat, icon="⚠️")