import warnings
warnings.filterwarnings("ignore")

import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent))

import streamlit as st
import tempfile
import os
import time

from classifier import ZeroShotVideoClassifier, MODELS
from frame_extractor import extract_frames

# Page-level settings, applied before any other element is rendered.
_PAGE_SETTINGS = {
    "page_title": "Zero-Shot Video Classifier",
    "page_icon": "🎬",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)


@st.cache_resource(show_spinner=False)
def load_classifier(model_key: str) -> ZeroShotVideoClassifier:
    """Return the classifier for ``model_key``, building it only on a cache miss.

    ``st.cache_resource`` keeps the returned object alive across script
    reruns, keyed on ``model_key``, so the constructor is not re-executed for
    a model that has already been requested. The built-in cache spinner is
    turned off; the call site wraps this call in its own ``st.spinner``.

    Args:
        model_key: Model identifier forwarded to ``ZeroShotVideoClassifier``.

    Returns:
        The (possibly cached) ``ZeroShotVideoClassifier`` instance.
    """
    classifier = ZeroShotVideoClassifier(model_key=model_key)
    return classifier


# Sidebar: model selection, video upload and frame-count controls.
# Defines `model_key`, `uploaded` and `num_frames`, which the main column reads.
with st.sidebar:
    st.title("🎬 VideoClassify")
    st.caption("Zero-shot video understanding via vision-language models")
    st.divider()

    model_key = st.selectbox("Model", options=list(MODELS.keys()), index=0)

    # One-line summary per model (size · benchmark score · training regime).
    model_info = {
        "CLIP ViT-B/32": "338MB · UCF-101 Top-1: 58.22% · Pure zero-shot",
        "SigLIP 2 Base": "350MB · UCF-101 Top-1: 70.79% · Pure zero-shot",
        "X-CLIP Base":   "780MB · UCF-101 Top-1: 72.44% · Kinetics pretrained",
    }
    # The selectable keys come from MODELS (defined in another module), so a
    # model may exist there without a blurb here. Skip the caption in that
    # case instead of crashing the whole page with a KeyError.
    model_summary = model_info.get(model_key)
    if model_summary:
        st.caption(model_summary)

    st.divider()

    uploaded = st.file_uploader("Upload Video", type=["mp4", "avi", "mov", "mkv"])
    num_frames = st.slider("Frames to sample", min_value=4, max_value=16, value=8)


# Main page header, rendered above the two content columns.
st.title("Zero-Shot Video Classifier")
st.caption("Classify any video using natural language — no task-specific training required")
st.divider()

# Two-column body; the spec gives relative widths, so the left column
# (classification) is slightly wider than the right (benchmark notes).
left_col, right_col = st.columns([1.1, 0.9], gap="large")

def _classify_upload(video, model_name: str, candidate_labels, frame_count: int):
    """Run the upload -> frames -> predictions pipeline for one video.

    The upload is written to a temporary file because ``extract_frames`` is
    called with a filesystem path. That file is created with ``delete=False``
    so it outlives the ``with`` block, which makes this function responsible
    for removing it; the ``finally`` clause does so on every exit path
    (success, no frames, or an exception while loading/extracting/classifying).

    Args:
        video: Uploaded file object from ``st.file_uploader``; only ``.name``
            and ``.read()`` are used.
        model_name: Key passed to ``load_classifier``.
        candidate_labels: Label strings to score the video against.
        frame_count: Number of frames requested from ``extract_frames``.

    Returns:
        A ``(frames, predictions, elapsed)`` tuple. ``predictions`` is ``None``
        and ``elapsed`` is ``0.0`` when no frames could be extracted.
        ``elapsed`` covers only the ``classify`` call, in seconds.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=Path(video.name).suffix) as tmp:
            # Record the path first so cleanup still happens if the write fails.
            tmp_path = tmp.name
            tmp.write(video.read())

        with st.spinner(f"Loading {model_name}..."):
            classifier = load_classifier(model_name)

        with st.spinner("Extracting frames..."):
            frames = extract_frames(tmp_path, num_frames=frame_count)

        if not frames:
            return frames, None, 0.0

        with st.spinner("Classifying..."):
            # perf_counter is monotonic, unlike wall-clock time.time().
            started = time.perf_counter()
            predictions = classifier.classify(
                frames, candidate_labels, top_k=len(candidate_labels)
            )
            elapsed = time.perf_counter() - started

        return frames, predictions, elapsed
    finally:
        if tmp_path is not None:
            Path(tmp_path).unlink(missing_ok=True)


# Left column: label entry, run button and classification results.
with left_col:
    st.subheader("Classification")

    labels_input = st.text_area(
        "Labels (one per line)",
        value="playing basketball\nswimming\ncooking food\nriding a bike\ndoing archery\nplaying guitar\nweightlifting\ndancing",
        height=200
    )

    if not uploaded:
        st.info("Upload a video in the sidebar to get started.")
        st.markdown("")

    # Single button definition; it is greyed out until a video is uploaded.
    run = st.button(
        "▶  Run Classification",
        use_container_width=True,
        type="primary",
        disabled=not uploaded,
    )

    if uploaded and run:
        # One label per non-blank line, with surrounding whitespace removed.
        labels = [line.strip() for line in labels_input.splitlines() if line.strip()]

        if len(labels) < 2:
            st.error("Enter at least 2 labels.")
        else:
            frames, predictions, elapsed = _classify_upload(
                uploaded, model_key, labels, num_frames
            )

            if not frames:
                st.error("Could not read frames from this video.")
            else:
                st.divider()

                # The first prediction is treated as the top result.
                m1, m2, m3 = st.columns(3)
                m1.metric("Top Prediction", predictions[0]["label"].title())
                m2.metric("Confidence", f"{predictions[0]['score']*100:.1f}%")
                m3.metric("Inference Time", f"{elapsed:.1f}s")

                st.divider()
                st.subheader("Predictions")

                for i, r in enumerate(predictions):
                    rank = "🥇" if i == 0 else f"#{i+1}"
                    # Bar length is the score doubled (presumably so small
                    # scores stay visible). st.progress only accepts a plain
                    # float within [0.0, 1.0], so coerce and clamp both ends.
                    bar_fill = max(0.0, min(float(r["score"]) * 2, 1.0))
                    st.progress(
                        bar_fill,
                        text=f"{rank}  {r['label'].capitalize()}  —  {r['score']*100:.2f}%"
                    )

                st.divider()
                st.subheader("Sampled Frames")
                # One column per sampled frame, shown side by side.
                cols = st.columns(len(frames))
                for col, frame in zip(cols, frames):
                    with col:
                        st.image(frame, use_container_width=True)

# Right column: static benchmark table and per-model notes.
with right_col:
    st.subheader("Benchmark — UCF-101")
    st.caption("10 videos per class · 101 classes · 1010 videos total")

    # Benchmark results, one tuple per model, pivoted into columns below.
    benchmark_columns = ("Model", "Size", "Top-1", "Top-5", "Type")
    benchmark_rows = (
        ("CLIP ViT-B/32", "338MB", "58.22%", "85.35%", "Pure zero-shot"),
        ("SigLIP 2 Base", "350MB", "70.79%", "93.27%", "Pure zero-shot"),
        ("X-CLIP Base", "780MB", "72.44%", "91.24%", "Kinetics pretrained"),
    )
    benchmark_table = {
        column: [row[position] for row in benchmark_rows]
        for position, column in enumerate(benchmark_columns)
    }

    st.dataframe(
        data=benchmark_table,
        use_container_width=True,
        hide_index=True,
    )

    st.divider()
    st.subheader("Model Notes")

    # (expander title, description, optional caveat shown as a warning).
    model_notes = (
        (
            "CLIP ViT-B/32",
            "Contrastive softmax loss. Frames encoded independently then averaged into a single video embedding. Genuine zero-shot — no video-specific pretraining.",
            None,
        ),
        (
            "SigLIP 2 Base",
            "Sigmoid loss objective instead of softmax. Scores per label are independent, making confidence values more calibrated. Best efficiency-to-accuracy ratio of the three.",
            None,
        ),
        (
            "X-CLIP Base",
            "Video-native architecture with cross-frame attention — frames attend to each other before producing the final embedding. Pretrained on Kinetics-400 which has class overlap with UCF-101, so its score is not a clean zero-shot result.",
            "Kinetics-400 / UCF-101 class overlap — results may be inflated.",
        ),
    )

    for heading, description, caveat in model_notes:
        with st.expander(heading, expanded=True):
            st.write(description)
            if caveat is not None:
                st.warning(caveat, icon="⚠️")