Spaces:

WaysAheadGlobal
/

VLM

Sleeping

App Files Files Community

WaysAheadGlobal committed on Jun 28

Commit

24e5396

verified ·

1 Parent(s): 3da8dd4

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -17

app.py CHANGED Viewed

@@ -1,13 +1,12 @@
-# app.py
 import streamlit as st
 from transformers import pipeline
 from PIL import Image
-import requests
-st.set_page_config(page_title="TinyLLaVA (Streamlit)", layout="centered")
-st.title("🦙 TinyLLaVA — Vision-Language Q&A")
 pipe = pipeline(
     task="image-to-text",
     model="bczhou/tiny-llava-v1-hf",
@@ -15,17 +14,53 @@ pipe = pipeline(
     device_map="cpu"
 )
-uploaded_file = st.file_uploader("📷 Upload an image", type=["jpg","png","jpeg"])
-prompt = st.text_input("💬 Ask a question (post `<image>` token):", value="What is happening?")
-if uploaded_file and prompt:
-    image = Image.open(uploaded_file).convert("RGB")
-    st.image(image, caption="Uploaded Image", use_column_width=True)
-    query = f"USER: <image>\n{prompt}\nASSISTANT:"
-    with st.spinner("Generating answer..."):
-        result = pipe(query, image)
-        answer = result[0]["generated_text"]
-    st.subheader("📝 Answer:")
-    st.write(answer)

 import streamlit as st
+from streamlit_webrtc import VideoTransformerBase, webrtc_streamer, RTCConfiguration
 from transformers import pipeline
 from PIL import Image
+import cv2
+import numpy as np
+import time
+# Load TinyLLaVA pipeline once
+# NOTE(review): this runs at module top level; Streamlit re-executes the
+# script on reruns, so "once" presumably needs st.cache_resource - confirm.
 pipe = pipeline(
     task="image-to-text",
     model="bczhou/tiny-llava-v1-hf",
     device_map="cpu"
 )
+# Page title/layout and the visible heading.
+st.set_page_config(page_title="TinyLLaVA Webcam", layout="centered")
+st.title("🦙 TinyLLaVA — Webcam Captioning")
+# Shared state
+# Two empty placeholders; neither is referenced again in the visible code.
+st_frame = st.empty()
+result_box = st.empty()
+class VideoProcessor(VideoTransformerBase):
+    def __init__(self):
+        self.last_run = 0
+        self.interval = 5  # seconds
+        self.last_caption = ""
+    def transform(self, frame):
+        img = frame.to_ndarray(format="bgr24")
+        now = time.time()
+        if now - self.last_run > self.interval:
+            self.last_run = now
+            # Convert BGR to RGB
+            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(img_rgb)
+            # Run TinyLLaVA pipeline
+            prompt = "Describe this scene in detail."
+            query = f"USER: <image>\n{prompt}\nASSISTANT:"
+            with st.spinner("TinyLLaVA is thinking..."):
+                result = pipe(query, pil_image)
+                self.last_caption = result[0]["generated_text"]
+        # Return the same frame, unmodified
+        return img
+# RTC config
+rtc_config = RTCConfiguration(
+    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
+)
+webrtc_ctx = webrtc_streamer(
+    key="example",
+    video_processor_factory=VideoProcessor,
+    rtc_configuration=rtc_config,
+    media_stream_constraints={"video": True, "audio": False}
+)
+if webrtc_ctx.video_processor:
+    st.info("Keep your webcam on. The app captures 1 frame every 5 seconds and generates a caption.")
+    st.write("Latest Caption:")
+    st.write(webrtc_ctx.video_processor.last_caption)