"""ShopGuard AI: Streamlit front end for video shoplifting classification.

Pipeline, as implemented below: sample a fixed number of frames from an
uploaded clip, crop each frame to the most confident YOLO person detection,
embed the crops with an ImageNet MobileNetV2 backbone, and feed the stacked
feature sequence to a Keras model downloaded from the Hugging Face Hub
(called the "LSTM" throughout this file).
"""

import os
import tempfile
import time

import cv2
import numpy as np
import streamlit as st
import tensorflow as tf
from huggingface_hub import hf_hub_download
from ultralytics import YOLO

# ── Page config ───────────────────────────────────────────────
# Must remain the first Streamlit call in the script.
st.set_page_config(
    page_title="ShopGuard AI",
    page_icon="🛡️",
    layout="wide",
)

# NOTE(review): this string (and the header string further down) is blank in
# the version under review; it looks like an inline <style>/HTML block was
# lost. Restore the markup here if the page styling is expected.
st.markdown(""" """, unsafe_allow_html=True)

# ── Config ────────────────────────────────────────────────────
FRAMES_PER_VIDEO = 16   # frames sampled per clip
IMG_SIZE = 224          # side length of the square crops fed to MobileNetV2
PERSON_CLASS = 0        # class id passed to YOLO's `classes` filter
YOLO_CONF = 0.3         # minimum YOLO detection confidence
PAD = 0.10              # box padding, as a fraction of box width/height

MODEL_CONFIGS = {
    "Model A — General": {
        "repo_id": "higsboson/shoplifting_exp_a",
        "filename": "shoplifting_a.keras",
        "default_threshold": 0.50,
        "label": "A",
    },
    "Model B — Kitchen": {
        "repo_id": "higsboson/shoplifting_exp_b",
        "filename": "best_model.keras",
        "default_threshold": 0.50,
        "label": "B",
    },
    "Model C — Lab": {
        "repo_id": "higsboson/shoplifting_exp_c",
        "filename": "shoplifting_exp_c.keras",
        "default_threshold": 0.50,
        "label": "C",
    },
}


# ── Loaders ───────────────────────────────────────────────────
@st.cache_resource
def load_yolo():
    """Return the YOLO detector built from ``yolo11n.pt`` (cached by Streamlit)."""
    return YOLO("yolo11n.pt")


@st.cache_resource
def load_mobilenet():
    """Return a frozen MobileNetV2 feature extractor (cached by Streamlit).

    The network is built without its classification head and with global
    average pooling, so it yields one feature vector per input image.
    """
    base = tf.keras.applications.MobileNetV2(
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
        include_top=False,
        pooling="avg",
        weights="imagenet",
    )
    base.trainable = False
    return base


@st.cache_resource
def load_lstm(repo_id, filename):
    """Download ``filename`` from the Hub repo ``repo_id`` and load it with Keras.

    Cached by Streamlit per ``(repo_id, filename)`` pair.
    """
    path = hf_hub_download(repo_id=repo_id, filename=filename)
    return tf.keras.models.load_model(path)


# ── Pipeline ──────────────────────────────────────────────────
def extract_frames(video_path, n=FRAMES_PER_VIDEO):
    """Sample ``n`` evenly spaced frames from a video file.

    Args:
        video_path: Path of the video to read.
        n: Number of frame indices to sample.

    Returns:
        A ``(frames, idxs)`` tuple. ``idxs`` is the integer array of sampled
        frame indices (it contains repeats when the clip has fewer than ``n``
        frames). ``frames`` maps each index that decoded successfully to its
        RGB image; indices that failed to decode are simply absent.
    """
    cap = cv2.VideoCapture(video_path)
    frames = {}
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        idxs = np.linspace(0, max(total - 1, 0), n, dtype=int)
        for idx in idxs:
            key = int(idx)
            if key in frames:
                # Repeated index (short clip): already decoded, skip the seek.
                continue
            cap.set(cv2.CAP_PROP_POS_FRAMES, key)
            ok, frame = cap.read()
            if ok:
                frames[key] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    finally:
        # Release the capture handle even if seeking/decoding raised.
        cap.release()
    return frames, idxs


def crop_person(frame, yolo_model, last_box):
    """Crop ``frame`` to the most confident person detection and resize it.

    Args:
        frame: Image array to run detection on.
        yolo_model: Callable detector; invoked with the frame plus the
            ``conf``/``classes``/``verbose`` keyword arguments.
        last_box: One-element list used as mutable state across calls. Slot 0
            holds the most recent padded ``(x1, y1, x2, y2)`` box, or ``None``.
            It is updated in place when a usable detection is found and
            reused when the current frame has none.

    Returns:
        The crop resized to ``IMG_SIZE`` x ``IMG_SIZE``. The full frame is
        used when no box is available or the box does not overlap the frame.
    """
    h, w = frame.shape[:2]
    results = yolo_model(
        frame, conf=YOLO_CONF, classes=[PERSON_CLASS], verbose=False
    )
    boxes = results[0].boxes
    if boxes is not None and len(boxes):
        best = max(boxes, key=lambda b: b.conf.item())
        x1, y1, x2, y2 = map(int, best.xyxy[0].tolist())
        pad_w = int(PAD * (x2 - x1))
        pad_h = int(PAD * (y2 - y1))
        x1 = max(0, x1 - pad_w)
        y1 = max(0, y1 - pad_h)
        x2 = min(w, x2 + pad_w)
        y2 = min(h, y2 + pad_h)
        # Only remember boxes with positive area; a degenerate detection
        # should not overwrite a good box from an earlier frame.
        if x2 > x1 and y2 > y1:
            last_box[0] = (x1, y1, x2, y2)

    crop = frame
    if last_box[0] is not None:
        x1, y1, x2, y2 = last_box[0]
        candidate = frame[y1:y2, x1:x2]
        # A remembered box can fall outside a differently sized frame, giving
        # an empty slice that cv2.resize would reject; keep the full frame.
        if candidate.size:
            crop = candidate
    return cv2.resize(crop, (IMG_SIZE, IMG_SIZE))


def run_inference(video_path, yolo_model, mobilenet, lstm_model, threshold):
    """Run the full pipeline on one video and classify it.

    Args:
        video_path: Path of the video file.
        yolo_model: Detector passed through to :func:`crop_person`.
        mobilenet: Model whose ``predict`` maps preprocessed crops to features.
        lstm_model: Model whose ``predict`` maps the feature sequence to a
            score read from ``[0][0]`` of its output.
        threshold: Scores greater than or equal to this are labelled
            ``"SHOPLIFTING"``, otherwise ``"NORMAL"``.

    Returns:
        ``(prob, label, crops)``: the score as a float, the label string and
        the list of per-frame crops in sampling order.

    Raises:
        ValueError: If no frame of the video could be decoded.
    """
    frames_dict, idxs = extract_frames(video_path)
    if not frames_dict:
        raise ValueError(
            "No frames could be decoded from the uploaded video."
        )

    last_box = [None]
    blank = np.zeros((IMG_SIZE, IMG_SIZE, 3), np.uint8)
    crops = []
    for idx in idxs:
        frame = frames_dict.get(int(idx))
        if frame is None:
            # Undecodable frame: use a black crop directly instead of running
            # detection on a black image.
            crops.append(blank)
        else:
            crops.append(crop_person(frame, yolo_model, last_box))

    crops_arr = np.array(crops, dtype=np.float32)
    crops_pp = tf.keras.applications.mobilenet_v2.preprocess_input(crops_arr)
    features = mobilenet.predict(crops_pp, verbose=0)
    features = features[np.newaxis, ...]  # add the batch axis
    prob = lstm_model.predict(features, verbose=0)[0][0]
    label = "SHOPLIFTING" if prob >= threshold else "NORMAL"
    return float(prob), label, crops


# ── Header ────────────────────────────────────────────────────
# NOTE(review): blank in the version under review; header markup appears lost.
st.markdown(""" """, unsafe_allow_html=True)

# ── Layout: Left config | Right result ───────────────────────
col_left, col_right = st.columns([1, 1.6], gap="large")

with col_left:
    st.markdown("\n\n⚙ Model Configuration\n\n", unsafe_allow_html=True)
    model_choice = st.selectbox(
        "Select Model",
        list(MODEL_CONFIGS.keys()),
        help="Choose which trained model to run inference with",
    )
    cfg = MODEL_CONFIGS[model_choice]
    st.markdown(f'\n\nHF: {cfg["repo_id"]}\n\n', unsafe_allow_html=True)

    threshold = st.slider(
        "Decision Threshold",
        min_value=0.0,
        max_value=1.0,
        value=cfg["default_threshold"],
        step=0.01,
        help="Probability above this = Shoplifting. Adjust per your validation results.",
    )
    st.caption(
        f"ℹ️ Prob ≥ {threshold:.2f} → 🚨 Shoplifting | Prob < {threshold:.2f} → ✅ Normal"
    )
    st.divider()

    st.markdown("\n\n📹 Video Input\n\n", unsafe_allow_html=True)
    uploaded = st.file_uploader(
        "Upload Video",
        type=["mp4", "avi", "mov", "mkv"],
        help="Short clips (5–30s) work best",
    )
    run_btn = st.button("🔍 Run Inference", disabled=(uploaded is None))

with col_right:
    if uploaded is None:
        st.markdown(
            "\n\n"
            "            Upload a video on the left\n"
            "and click Run Inference\n"
            "        \n"
            "\n",
            unsafe_allow_html=True,
        )
    else:
        # Preview straight from the upload; no file on disk is needed for
        # this, so plain reruns no longer create temp files.
        st.video(uploaded)

        if run_btn:
            with st.spinner("Loading YOLO + MobileNetV2..."):
                yolo = load_yolo()
                mobilenet = load_mobilenet()
            with st.spinner(f"Downloading model from HuggingFace ({cfg['repo_id']})..."):
                lstm = load_lstm(cfg["repo_id"], cfg["filename"])

            # OpenCV reads from a path, so spill the upload to a temp file
            # that keeps the original extension, and always remove it.
            suffix = os.path.splitext(uploaded.name)[1] or ".mp4"
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                tmp.write(uploaded.getvalue())
                tmp_path = tmp.name

            result = None
            try:
                with st.spinner("Running pipeline: frame extraction → YOLO crop → feature extraction → LSTM..."):
                    t0 = time.perf_counter()
                    result = run_inference(tmp_path, yolo, mobilenet, lstm, threshold)
                    elapsed = time.perf_counter() - t0
            except ValueError as exc:
                st.error(str(exc))
            finally:
                os.unlink(tmp_path)

            if result is not None:
                prob, label, crops = result
                st.divider()

                # Result card
                is_shop = label == "SHOPLIFTING"
                # NOTE(review): the three style values below are not referenced
                # by the card text as it stands; they look like inputs to HTML
                # markup that is missing from the string. Kept for when the
                # markup is restored.
                card_cls = "result-shoplifting" if is_shop else "result-normal"
                lbl_cls = "result-label-shop" if is_shop else "result-label-norm"
                icon = "🚨" if is_shop else "✅"
                bar_color = "#f85149" if is_shop else "#3fb950"

                st.markdown(
                    f"\n\n{icon} {label}\n\n"
                    f"Confidence: {prob:.4f}\n\n"
                    f"Model {cfg['label']} | Threshold: {threshold:.2f} | Inference: {elapsed:.2f}s\n\n"
                    f"                        {prob*100:.1f}%\n"
                    "                    \n"
                    "\n",
                    unsafe_allow_html=True,
                )

                # Metrics row
                st.divider()
                m1, m2, m3 = st.columns(3)
                m1.metric("Probability", f"{prob:.4f}")
                m2.metric("Threshold", f"{threshold:.2f}")
                m3.metric("Inference", f"{elapsed:.2f}s")

                # Sampled crops (the first eight, one per column)
                st.markdown("\n\n🎞 YOLO-Cropped Frames\n\n", unsafe_allow_html=True)
                cols = st.columns(8)
                for i, (col, crop) in enumerate(zip(cols, crops)):
                    col.image(crop, use_container_width=True, caption=f"f{i+1}")