Spaces:

WaysAheadGlobal
/

VLM

Sleeping

App Files Files Community

WaysAheadGlobal committed on Jun 28

Commit

106eff3

verified ·

1 Parent(s): 1d2a390

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -45

app.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import streamlit as st
-from streamlit_webrtc import VideoTransformerBase, webrtc_streamer, RTCConfiguration
 from transformers import pipeline
 from PIL import Image
-import cv2
 import numpy as np
 import time
-# Load TinyLLaVA pipeline once
 pipe = pipeline(
     task="image-to-text",
     model="bczhou/tiny-llava-v1-hf",
@@ -14,53 +16,37 @@ pipe = pipeline(
     device_map="cpu"
 )
-st.set_page_config(page_title="TinyLLaVA Webcam", layout="centered")
-st.title("🦙 TinyLLaVA — Webcam Captioning")
-# Shared state
-st_frame = st.empty()
-result_box = st.empty()
-class VideoProcessor(VideoTransformerBase):
-    def __init__(self):
-        self.last_run = 0
-        self.interval = 5  # seconds
-        self.last_caption = ""
-    def transform(self, frame):
-        img = frame.to_ndarray(format="bgr24")
-        now = time.time()
-        if now - self.last_run > self.interval:
-            self.last_run = now
-            # Convert BGR to RGB
-            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(img_rgb)
-            # Run TinyLLaVA pipeline
-            prompt = "Describe this scene in detail."
-            query = f"USER: <image>\n{prompt}\nASSISTANT:"
-            with st.spinner("TinyLLaVA is thinking..."):
-                result = pipe(query, pil_image)
-                self.last_caption = result[0]["generated_text"]
-        # Return the same frame, unmodified
-        return img
-# RTC config
-rtc_config = RTCConfiguration(
-    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
-)
-webrtc_ctx = webrtc_streamer(
-    key="example",
-    video_processor_factory=VideoProcessor,
-    rtc_configuration=rtc_config,
-    media_stream_constraints={"video": True, "audio": False}
-)
-if webrtc_ctx.video_processor:
-    st.info("Keep your webcam on. The app captures 1 frame every 5 seconds and generates a caption.")
-    st.write("Latest Caption:")
-    st.write(webrtc_ctx.video_processor.last_caption)

 import streamlit as st
+import cv2
 from transformers import pipeline
 from PIL import Image
 import numpy as np
 import time
+st.set_page_config(page_title="🎥 TinyLLaVA CCTV Alternative", layout="wide")
+st.title("🧠 TinyLLaVA — Webcam Frame-by-Frame (No WebRTC)")
+# Load TinyLLaVA pipeline
 pipe = pipeline(
     task="image-to-text",
     model="bczhou/tiny-llava-v1-hf",
     device_map="cpu"
 )
+# OpenCV webcam
+cap = cv2.VideoCapture(0)
+FRAME_INTERVAL = 30  # process every 30 frames
+frame_placeholder = st.empty()
+caption_placeholder = st.empty()
+frame_count = 0
+last_caption = ""
+while cap.isOpened():
+    ret, frame = cap.read()
+    if not ret:
+        st.warning("No webcam feed")
+        break
+    frame = cv2.flip(frame, 1)  # selfie view
+    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+    frame_placeholder.image(rgb, channels="RGB", use_column_width=True)
+    # every FRAME_INTERVAL frames → run TinyLLaVA
+    if frame_count % FRAME_INTERVAL == 0:
+        pil_image = Image.fromarray(rgb)
+        prompt = "Describe this scene in detail."
+        query = f"USER: <image>\n{prompt}\nASSISTANT:"
+        result = pipe(query, pil_image)
+        last_caption = result[0]["generated_text"]
+    caption_placeholder.markdown(f"**Latest:** {last_caption}")
+    frame_count += 1
+    # Slow down loop to save CPU (adjust if needed)
+    time.sleep(0.1)