| import warnings |
| warnings.filterwarnings("ignore") |
|
|
| import sys |
| from pathlib import Path |
| sys.path.append(str(Path(__file__).parent)) |
|
|
| import streamlit as st |
| import tempfile |
| import os |
| import time |
|
|
| from classifier import ZeroShotVideoClassifier, MODELS |
| from frame_extractor import extract_frames |
|
|
# Page-wide settings: title and icon, a wide layout, and a sidebar that
# starts expanded.
_PAGE_SETTINGS = {
    "page_title": "Zero-Shot Video Classifier",
    "page_icon": "🎬",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
|
|
|
|
@st.cache_resource(show_spinner=False)
def load_classifier(model_key: str) -> ZeroShotVideoClassifier:
    """Construct the classifier for the given model key.

    The function is wrapped in ``st.cache_resource``, so a classifier built
    for a given ``model_key`` is reused instead of being rebuilt on every
    script rerun. Streamlit's built-in cache spinner is turned off here; the
    call site wraps this call in its own ``st.spinner``.

    Args:
        model_key: Identifier passed straight through to
            ``ZeroShotVideoClassifier`` (presumably one of the keys of
            ``MODELS`` — confirm against the classifier module).

    Returns:
        The ``ZeroShotVideoClassifier`` instance for ``model_key``.
    """
    classifier = ZeroShotVideoClassifier(model_key=model_key)
    return classifier
|
|
|
|
# Sidebar: branding, model picker, video upload and frame-count control.
# Defines the script-level names ``model_key``, ``uploaded`` and
# ``num_frames`` that the main area reads later.
with st.sidebar:
    st.title("🎬 VideoClassify")
    st.caption("Zero-shot video understanding via vision-language models")
    st.divider()

    model_key = st.selectbox("Model", options=list(MODELS.keys()), index=0)

    # One-line summary shown under the picker for each known model.
    model_info = {
        "CLIP ViT-B/32": "338MB · UCF-101 Top-1: 58.22% · Pure zero-shot",
        "SigLIP 2 Base": "350MB · UCF-101 Top-1: 70.79% · Pure zero-shot",
        "X-CLIP Base": "780MB · UCF-101 Top-1: 72.44% · Kinetics pretrained",
    }
    # MODELS is defined in another module, so its keys can drift away from
    # this hard-coded table (and the selectbox may yield None when there are
    # no options). Look the blurb up defensively instead of crashing the
    # whole page with a KeyError.
    model_blurb = model_info.get(model_key)
    if model_blurb:
        st.caption(model_blurb)

    st.divider()

    uploaded = st.file_uploader("Upload Video", type=["mp4", "avi", "mov", "mkv"])
    num_frames = st.slider("Frames to sample", min_value=4, max_value=16, value=8)
|
|
|
|
# Main-area heading, followed by a two-column split in which the left column
# is slightly wider than the right one.
st.title("Zero-Shot Video Classifier")
st.caption("Classify any video using natural language — no task-specific training required")
st.divider()

_COLUMN_WIDTHS = [1.1, 0.9]
left_col, right_col = st.columns(_COLUMN_WIDTHS, gap="large")
|
|
# The progress-bar fill is the score multiplied by this factor (capped at a
# full bar); the bar's text still shows the unscaled percentage.
# NOTE(review): presumably chosen so that small scores stay visible — confirm.
_BAR_SCALE = 2


def _parse_labels(raw):
    """Return the non-blank lines of ``raw``, stripped and de-duplicated in order."""
    stripped = (line.strip() for line in raw.splitlines())
    # dict.fromkeys keeps first-seen order while dropping exact repeats, so
    # the same label entered twice does not count as two candidate classes.
    return list(dict.fromkeys(label for label in stripped if label))


def _classify_upload(upload, labels, model_name, frame_count):
    """Run frame extraction and classification on an uploaded video.

    The upload is written to a temporary file because ``extract_frames`` is
    given a filesystem path. That file is always removed before returning,
    including when loading, extraction or classification raises.

    Args:
        upload: The object returned by ``st.file_uploader``.
        labels: Candidate label strings.
        model_name: Key handed to ``load_classifier``.
        frame_count: Number of frames to request from ``extract_frames``.

    Returns:
        ``(frames, predictions, elapsed)``. When no frames could be read,
        ``predictions`` is an empty list and ``elapsed`` is ``0.0``.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=Path(upload.name).suffix)
    tmp_path = Path(tmp.name)
    try:
        with tmp:
            # getvalue() returns the whole buffer regardless of the current
            # stream position, unlike read().
            tmp.write(upload.getvalue())

        with st.spinner(f"Loading {model_name}..."):
            classifier = load_classifier(model_name)

        with st.spinner("Extracting frames..."):
            frames = extract_frames(str(tmp_path), num_frames=frame_count)

        if not frames:
            return frames, [], 0.0

        with st.spinner("Classifying..."):
            started = time.perf_counter()  # monotonic; suited to intervals
            predictions = classifier.classify(frames, labels, top_k=len(labels))
            elapsed = time.perf_counter() - started

        return frames, predictions, elapsed
    finally:
        # delete=False means nothing else will clean this file up.
        tmp_path.unlink(missing_ok=True)


def _render_results(frames, predictions, elapsed):
    """Draw the headline metrics, per-label score bars and sampled frames.

    ``predictions`` must be non-empty; each entry is read via its ``"label"``
    and ``"score"`` keys.
    """
    # NOTE(review): the first entry is treated as the best match — presumably
    # classify() returns results sorted by descending score; confirm.
    best = predictions[0]

    st.divider()
    top_metric, confidence_metric, time_metric = st.columns(3)
    top_metric.metric("Top Prediction", best["label"].title())
    confidence_metric.metric("Confidence", f"{best['score']*100:.1f}%")
    time_metric.metric("Inference Time", f"{elapsed:.1f}s")

    st.divider()
    st.subheader("Predictions")
    for position, result in enumerate(predictions):
        # Coerce to a plain float so non-Python numeric scalars are accepted.
        score = float(result["score"])
        rank = "🥇" if position == 0 else f"#{position+1}"
        # Clamp on both sides so the fill always lies within [0.0, 1.0].
        fill = max(0.0, min(score * _BAR_SCALE, 1.0))
        st.progress(
            fill,
            text=f"{rank} {result['label'].capitalize()} — {score*100:.2f}%",
        )

    st.divider()
    st.subheader("Sampled Frames")
    for column, frame in zip(st.columns(len(frames)), frames):
        with column:
            st.image(frame, use_container_width=True)


# NOTE(review): the original indentation of this section was lost; everything
# down to the sampled frames is assumed to render inside the left column —
# confirm against the intended layout.
with left_col:
    st.subheader("Classification")

    labels_input = st.text_area(
        "Labels (one per line)",
        value="playing basketball\nswimming\ncooking food\nriding a bike\ndoing archery\nplaying guitar\nweightlifting\ndancing",
        height=200,
    )

    if not uploaded:
        st.info("Upload a video in the sidebar to get started.")
        st.markdown("")
    # One button for both states; it is disabled until a video is uploaded.
    run = st.button(
        "▶ Run Classification",
        use_container_width=True,
        type="primary",
        disabled=not uploaded,
    )

    if uploaded and run:
        labels = _parse_labels(labels_input)

        if len(labels) < 2:
            st.error("Enter at least 2 labels.")
        else:
            frames, predictions, elapsed = _classify_upload(
                uploaded, labels, model_key, num_frames
            )

            if not frames:
                st.error("Could not read frames from this video.")
            elif not predictions:
                # Guard the predictions[0] access in _render_results.
                st.error("The classifier returned no predictions.")
            else:
                _render_results(frames, predictions, elapsed)
|
|
# Right column: a static UCF-101 benchmark table and a note per model.
# NOTE(review): the original indentation was lost; the overlap warning is
# assumed to sit inside the X-CLIP expander — confirm.
with right_col:
    st.subheader("Benchmark — UCF-101")
    st.caption("10 videos per class · 101 classes · 1010 videos total")

    # One tuple per model, transposed below into the column-oriented mapping
    # that is handed to st.dataframe.
    benchmark_columns = ("Model", "Size", "Top-1", "Top-5", "Type")
    benchmark_rows = [
        ("CLIP ViT-B/32", "338MB", "58.22%", "85.35%", "Pure zero-shot"),
        ("SigLIP 2 Base", "350MB", "70.79%", "93.27%", "Pure zero-shot"),
        ("X-CLIP Base", "780MB", "72.44%", "91.24%", "Kinetics pretrained"),
    ]
    benchmark_table = {
        header: list(values)
        for header, values in zip(benchmark_columns, zip(*benchmark_rows))
    }
    st.dataframe(data=benchmark_table, use_container_width=True, hide_index=True)

    st.divider()
    st.subheader("Model Notes")

    # (expander title, body text, optional warning shown under the body)
    model_notes = [
        (
            "CLIP ViT-B/32",
            "Contrastive softmax loss. Frames encoded independently then averaged into a single video embedding. Genuine zero-shot — no video-specific pretraining.",
            None,
        ),
        (
            "SigLIP 2 Base",
            "Sigmoid loss objective instead of softmax. Scores per label are independent, making confidence values more calibrated. Best efficiency-to-accuracy ratio of the three.",
            None,
        ),
        (
            "X-CLIP Base",
            "Video-native architecture with cross-frame attention — frames attend to each other before producing the final embedding. Pretrained on Kinetics-400 which has class overlap with UCF-101, so its score is not a clean zero-shot result.",
            "Kinetics-400 / UCF-101 class overlap — results may be inflated.",
        ),
    ]
    for note_title, note_body, note_caveat in model_notes:
        with st.expander(note_title, expanded=True):
            st.write(note_body)
            if note_caveat is not None:
                st.warning(note_caveat, icon="⚠️")
|
|