Spaces:
Build error
Build error
File size: 1,761 Bytes
1c5a277 a3895ed 1c5a277 a3895ed 4265501 a3895ed 4265501 a3895ed 1c5a277 f9d091a 4265501 1c5a277 f9d091a 0932151 4265501 1c5a277 4265501 1c5a277 4265501 1c5a277 a3895ed 4265501 1c5a277 4265501 a3895ed 1c5a277 4265501 1c5a277 4265501 1c5a277 a3895ed 1c5a277 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# app.py
# Webcam captioning demo: streams frames from the local camera and captions
# them with LLaVA 1.5 (7B) via Hugging Face transformers, on CPU.
import gradio as gr
import torch
import cv2
from PIL import Image
from transformers import LlavaProcessor, LlavaForConditionalGeneration
# Load LLaVA model (MiniGPT-4 style)
# NOTE: this downloads/loads a ~7B-parameter checkpoint at import time;
# first run is slow and memory-hungry on CPU.
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id)
# Explicit CPU device — no GPU assumed in this Space.
device = torch.device("cpu")
model.to(device)
# Function: read webcam, yield frame + LLaVA caption every few seconds
def webcam_llava():
    """Stream (frame, caption) pairs from the default webcam.

    Opens camera 0, and on each iteration grabs a frame, runs LLaVA
    captioning on it, and yields the RGB frame together with the decoded
    caption. Pauses ~10 s between frames to keep CPU inference manageable.

    Yields:
        tuple[numpy.ndarray, str]: RGB frame (H, W, 3) and its caption.

    Raises:
        RuntimeError: if the webcam cannot be opened.
    """
    import time  # local import: only needed for pacing this loop

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        raise RuntimeError("Webcam could not be opened.")
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV delivers BGR; PIL and the model expect RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)
            # LLaVA chat template: image placeholder + user turn.
            prompt = "<image>\nUSER: Describe this scene in detail.\nASSISTANT:"
            inputs = processor(prompt, pil_image, return_tensors="pt").to(device)
            # no_grad: inference only — avoids building autograd graphs on CPU.
            with torch.no_grad():
                output = model.generate(**inputs, max_new_tokens=200)
            caption = processor.decode(output[0], skip_special_tokens=True)
            yield rgb_frame, caption
            # time.sleep, not cv2.waitKey: waitKey needs a HighGUI window
            # event loop, which a headless Gradio server does not have.
            time.sleep(10)  # 10 seconds between frames for CPU safety
    finally:
        # Release the camera even if captioning raises mid-loop.
        cap.release()
# Gradio app
# Gradio UI: an image pane plus a caption textbox, fed by the streaming
# webcam generator when the page loads.
with gr.Blocks() as demo:
    gr.Markdown("# 🎥 LLaVA MiniGPT-4 Webcam Captioning\n_(CPU, slow but descriptive)_")
    webcam_display = gr.Image(label="Live Webcam")
    description = gr.Textbox(label="LLaVA Caption")
    # webcam_llava is a generator, so Gradio streams each yielded
    # (frame, caption) pair to the outputs as it arrives. Do NOT also pass
    # every=1 here: `every` re-invokes the function each second, which would
    # restart the generator and reopen the webcam repeatedly.
    demo.load(
        fn=webcam_llava,
        inputs=None,
        outputs=[webcam_display, description],
    )
demo.launch()
|