Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from streamlit_webrtc import VideoTransformerBase, webrtc_streamer, RTCConfiguration | |
| from transformers import pipeline | |
| from PIL import Image | |
| import cv2 | |
| import numpy as np | |
| import time | |
# Page configuration goes first: Streamlit expects set_page_config to be the
# first Streamlit command of a script run, and the cached loader below may
# render its own loading indicator.
st.set_page_config(page_title="TinyLLaVA Webcam", layout="centered")


@st.cache_resource
def load_pipeline():
    """Build the TinyLLaVA image-to-text pipeline.

    Streamlit re-executes this script from the top on every rerun, so a bare
    module-level ``pipeline(...)`` call would reload the model each time.
    ``st.cache_resource`` keeps a single instance and reuses it across reruns.

    Returns:
        The ``transformers`` pipeline object for the "image-to-text" task.
    """
    return pipeline(
        task="image-to-text",
        model="bczhou/tiny-llava-v1-hf",
        # NOTE(review): trust_remote_code lets code shipped in the model repo
        # run locally. Confirm whether this checkpoint needs it; drop it if not.
        trust_remote_code=True,
        device_map="cpu",
    )


# Module-level handle used by the video processor for inference.
pipe = load_pipeline()

st.title("🦙 TinyLLaVA — Webcam Captioning")

# Shared state: placeholders reserved above the video widget.
st_frame = st.empty()
result_box = st.empty()
class VideoProcessor(VideoTransformerBase):
    """Pass webcam frames through unchanged and caption one periodically.

    At most once every ``interval`` seconds, the current frame is sent to the
    module-level ``pipe`` and the resulting text is stored in ``last_caption``
    for the Streamlit script to read.

    Frame callbacks are run by streamlit-webrtc outside the Streamlit script
    thread (see the streamlit-webrtc documentation), so this class makes no
    ``st.*`` calls; it only updates plain attributes.

    Attributes:
        last_run: ``time.monotonic()`` timestamp of the last captioned frame.
        interval: Minimum number of seconds between two captioning runs.
        max_new_tokens: Cap on generated tokens; lower it to shorten the
            pause in the video while the model runs on CPU.
        last_caption: Most recent caption, or ``""`` before the first run.
    """

    PROMPT = "Describe this scene in detail."
    ANSWER_MARKER = "ASSISTANT:"

    def __init__(self):
        super().__init__()
        # -inf guarantees the very first frame is captioned immediately,
        # whatever the monotonic clock's starting value is.
        self.last_run = float("-inf")
        self.interval = 5  # seconds
        self.max_new_tokens = 200
        self.last_caption = ""

    def transform(self, frame):
        """Return the frame as a BGR array, refreshing the caption when due.

        Args:
            frame: Incoming video frame exposing ``to_ndarray(format=...)``.

        Returns:
            The frame's pixels as a ``bgr24`` ndarray, unmodified.
        """
        img = frame.to_ndarray(format="bgr24")

        # A monotonic clock keeps the throttle immune to wall-clock jumps.
        now = time.monotonic()
        if now - self.last_run > self.interval:
            self.last_run = now
            self.last_caption = self._caption(img)

        # Return the same frame, unmodified
        return img

    def _caption(self, bgr_image):
        """Run TinyLLaVA on one BGR frame and return the answer text."""
        # The frame is BGR; PIL expects RGB.
        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(rgb_image)

        query = f"USER: <image>\n{self.PROMPT}\n{self.ANSWER_MARKER}"
        # The image is the pipeline's positional input; the text goes in
        # through the ``prompt`` keyword.
        result = pipe(
            pil_image,
            prompt=query,
            generate_kwargs={"max_new_tokens": self.max_new_tokens},
        )
        generated = result[0]["generated_text"]

        # The output may echo the prompt; keep only what follows the last
        # answer marker, and fall back to the whole text if it is absent.
        _, marker, answer = generated.rpartition(self.ANSWER_MARKER)
        return answer.strip() if marker else generated.strip()
# RTC config: a public STUN server so the browser and server can negotiate a
# WebRTC connection.
rtc_config = RTCConfiguration(
    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
)

# Video-only stream; each session gets its own VideoProcessor instance.
webrtc_ctx = webrtc_streamer(
    key="example",
    video_processor_factory=VideoProcessor,
    rtc_configuration=rtc_config,
    media_stream_constraints={"video": True, "audio": False},
)

if webrtc_ctx.video_processor:
    st.info("Keep your webcam on. The app captures 1 frame every 5 seconds and generates a caption.")
    st.write("Latest Caption:")

    # The processor updates last_caption from its own thread and nothing
    # triggers a script rerun when it does, so a single st.write would show
    # the initial empty caption forever. Poll into a placeholder while the
    # stream is playing instead.
    caption_box = st.empty()
    while webrtc_ctx.state.playing:
        processor = webrtc_ctx.video_processor
        if processor is None:
            # The processor went away (stream stopping); stop polling.
            break
        caption_box.write(processor.last_caption)
        time.sleep(0.5)