Spaces:

WaysAheadGlobal
/

VLM

Sleeping

VLM

File size: 1,949 Bytes

import streamlit as st
from streamlit_webrtc import VideoTransformerBase, webrtc_streamer, RTCConfiguration
from transformers import pipeline
from PIL import Image
import cv2
import numpy as np
import time

# Page config is issued before anything else Streamlit-related (including the
# cached loader below), because it must be the first Streamlit command.
st.set_page_config(page_title="TinyLLaVA Webcam", layout="centered")
st.title("🦙 TinyLLaVA — Webcam Captioning")


@st.cache_resource
def _load_pipeline():
    """Build the TinyLLaVA image-to-text pipeline on CPU.

    Streamlit re-executes this script from the top on every rerun, so a bare
    module-level ``pipeline(...)`` call would rebuild the model each time.
    ``st.cache_resource`` keeps a single instance for the whole server
    process and hands it back on subsequent runs.
    """
    return pipeline(
        task="image-to-text",
        model="bczhou/tiny-llava-v1-hf",
        trust_remote_code=True,
        device_map="cpu",
    )


# Load TinyLLaVA pipeline once (shared across reruns and sessions)
pipe = _load_pipeline()

# Shared state
st_frame = st.empty()
result_box = st.empty()

class VideoProcessor(VideoTransformerBase):
    """Pass webcam frames through unchanged and caption one of them periodically.

    Attributes:
        last_run: ``time.time()`` value recorded when the most recent
            captioning attempt finished (0 before the first attempt).
        interval: Minimum number of seconds between captioning attempts.
        max_new_tokens: Upper bound on generated tokens per caption; lower it
            to shorten the time a caption takes on CPU.
        last_caption: Text of the most recent caption ("" until one exists).
            Read by the Streamlit script to display the result.
    """

    def __init__(self):
        self.last_run = 0
        self.interval = 5  # seconds
        self.max_new_tokens = 200
        self.last_caption = ""

    def transform(self, frame):
        """Return the frame as a BGR ndarray, captioning it if the interval elapsed.

        This method is invoked by streamlit-webrtc outside the Streamlit
        script thread, so it must not call ``st.*`` functions; it only stores
        the caption on ``self.last_caption`` for the script to pick up.
        """
        img = frame.to_ndarray(format="bgr24")

        if time.time() - self.last_run > self.interval:
            # Convert BGR to RGB, which is what PIL expects
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(img_rgb)

            prompt = "Describe this scene in detail."
            query = f"USER: <image>\n{prompt}\nASSISTANT:"
            try:
                # The image-to-text pipeline takes the image as its first
                # argument and the text prompt as the `prompt` keyword.
                result = pipe(
                    pil_image,
                    prompt=query,
                    generate_kwargs={"max_new_tokens": self.max_new_tokens},
                )
                generated = result[0]["generated_text"]
                # The generated text echoes the prompt; keep only the answer.
                # If the marker is absent, rpartition yields the full text.
                self.last_caption = generated.rpartition("ASSISTANT:")[2].strip()
            finally:
                # Stamp after the (blocking) inference, success or not, so the
                # next attempt is a full interval away instead of firing on
                # the very next frame when inference took longer than that.
                self.last_run = time.time()

        # Return the same frame, unmodified
        return img

# RTC config: a public STUN server so peers can discover a route to each other
rtc_config = RTCConfiguration(
    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
)

webrtc_ctx = webrtc_streamer(
    key="example",
    video_processor_factory=VideoProcessor,
    rtc_configuration=rtc_config,
    media_stream_constraints={"video": True, "audio": False}
)

if webrtc_ctx.video_processor:
    st.info("Keep your webcam on. The app captures 1 frame every 5 seconds and generates a caption.")
    st.write("Latest Caption:")
    caption_box = st.empty()

    # The caption is produced later, on the video worker, and nothing triggers
    # a script rerun when it changes. Writing it once here would only ever
    # show the initial empty string, so poll the processor while the stream
    # is live and refresh the placeholder in place.
    while webrtc_ctx.state.playing:
        processor = webrtc_ctx.video_processor
        if processor is None:
            # Stream was torn down between the state check and this read.
            break
        caption_box.write(processor.last_caption)
        time.sleep(0.5)