Blip / app.py
WaysAheadGlobal's picture
Update app.py
4265501 verified
raw
history blame
1.76 kB
# app.py
import time

import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import LlavaProcessor, LlavaForConditionalGeneration
# Load LLaVA model (MiniGPT-4 style)
# Downloads/loads the 7B LLaVA checkpoint at import time; this is slow and
# memory-heavy, so it is done once at module level rather than per request.
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id)
# Pin inference to CPU (no GPU assumed in this deployment).
device = torch.device("cpu")
model.to(device)
# Function: read webcam, yield frame + LLaVA caption every few seconds
def webcam_llava():
    """Stream webcam frames with LLaVA-generated captions.

    Generator: grabs a frame from the default webcam, runs LLaVA image
    captioning on it, yields ``(rgb_frame, caption)``, then sleeps before
    the next iteration (CPU inference on a 7B model is slow).

    Yields:
        tuple: (RGB frame as a numpy array, caption string).

    Raises:
        RuntimeError: if the webcam cannot be opened.
    """
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        raise RuntimeError("Webcam could not be opened.")
    # try/finally guarantees the camera is released even when the consumer
    # abandons the generator early (the original only released on read failure).
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV captures BGR; PIL and the processor expect RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)
            # --- Compose prompt for LLaVA ---
            prompt = "<image>\nUSER: Describe this scene in detail.\nASSISTANT:"
            inputs = processor(prompt, pil_image, return_tensors="pt").to(device)
            # Inference only: disable autograd to avoid tracking gradients
            # (saves memory and time on CPU).
            with torch.no_grad():
                output = model.generate(**inputs, max_new_tokens=200)
            caption = processor.decode(output[0], skip_special_tokens=True)
            # Yield current frame + caption
            yield rgb_frame, caption
            # BUG FIX: cv2.waitKey() only waits while pumping events for a
            # HighGUI window; with no window (headless server) it returns
            # immediately, so the intended 10 s pause never happened and the
            # loop ran generation back-to-back. time.sleep actually waits.
            time.sleep(10)  # 10 seconds for CPU safety
    finally:
        cap.release()
# Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# πŸŽ₯ LLaVA MiniGPT-4 Webcam Captioning\n_(CPU, slow but descriptive)_")
    # Live webcam frame (RGB array yielded by webcam_llava).
    webcam_display = gr.Image(label="Live Webcam")
    # Most recent caption generated for the displayed frame.
    description = gr.Textbox(label="LLaVA Caption")
    # On page load, start consuming the (frame, caption) generator and push
    # each yielded pair into the two components above.
    # NOTE(review): `every=1` re-triggers fn once per second; combined with a
    # generator fn this may restart the stream each tick rather than resume it
    # -- confirm against the installed Gradio version's `load(every=...)`
    # semantics (a plain generator fn streams on its own without `every`).
    demo.load(
        fn=webcam_llava,
        inputs=None,
        outputs=[webcam_display, description],
        every=1
    )
demo.launch()