Spaces:

WaysAheadGlobal
/

Blip

Build error

App Files Files Community

WaysAheadGlobal committed on Jun 28

Commit

4265501

verified ·

1 Parent(s): 1c5a277

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -22

app.py CHANGED Viewed

@@ -2,49 +2,61 @@
 import gradio as gr
 import torch
-from transformers import BlipProcessor, BlipForConditionalGeneration
 import cv2
 from PIL import Image
-# Load BLIP captioning model
-processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 device = torch.device("cpu")
 model.to(device)
-# Live webcam captioning generator
-def webcam_caption():
-    cap = cv2.VideoCapture(0)  # open webcam
     while True:
         ret, frame = cap.read()
         if not ret:
             break
-        # Convert OpenCV frame (BGR) to RGB PIL Image
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        image = Image.fromarray(frame_rgb)
-        # Generate caption
-        inputs = processor(images=image, return_tensors="pt").to(device)
-        out = model.generate(**inputs, max_new_tokens=50)
-        caption = processor.decode(out[0], skip_special_tokens=True)
-        yield frame_rgb, caption
     cap.release()
-# Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("## 🎥 Live Webcam BLIP Captioning (CPU)")
-    video = gr.Image(label="Webcam Stream")
-    text = gr.Textbox(label="Caption")
     demo.load(
-        fn=webcam_caption,
         inputs=None,
-        outputs=[video, text],
-        every=2  # call generator every 2 sec (adjust if you want)
     )
 demo.launch()

 import gradio as gr
 import torch
 import cv2
 from PIL import Image
+from transformers import LlavaProcessor, LlavaForConditionalGeneration
+# Load the LLaVA 1.5 (7B) vision-language model and its processor
+model_id = "llava-hf/llava-1.5-7b-hf"
+processor = LlavaProcessor.from_pretrained(model_id)
+model = LlavaForConditionalGeneration.from_pretrained(model_id)
 device = torch.device("cpu")
 model.to(device)
+# Function: read webcam, yield frame + LLaVA caption every few seconds
+def webcam_llava():
+    cap = cv2.VideoCapture(0)
+    if not cap.isOpened():
+        raise RuntimeError("Webcam could not be opened.")
     while True:
         ret, frame = cap.read()
         if not ret:
             break
+        # Convert BGR to RGB PIL
+        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        pil_image = Image.fromarray(rgb_frame)
+        # --- Compose prompt for LLaVA ---
+        prompt = "<image>\nUSER: Describe this scene in detail.\nASSISTANT:"
+        inputs = processor(prompt, pil_image, return_tensors="pt").to(device)
+        # Generate
+        output = model.generate(**inputs, max_new_tokens=200)
+        caption = processor.decode(output[0], skip_special_tokens=True)
+        # Yield current frame + caption
+        yield rgb_frame, caption
+        # Wait before next frame (adjust as needed)
+        cv2.waitKey(10000)  # 10 seconds for CPU safety
     cap.release()
+# Gradio app
 with gr.Blocks() as demo:
+    gr.Markdown("# 🎥 LLaVA MiniGPT-4 Webcam Captioning\n_(CPU, slow but descriptive)_")
+    webcam_display = gr.Image(label="Live Webcam")
+    description = gr.Textbox(label="LLaVA Caption")
     demo.load(
+        fn=webcam_llava,
         inputs=None,
+        outputs=[webcam_display, description],
+        every=1
     )
 demo.launch()