Blip / app.py
WaysAheadGlobal's picture
Update app.py
4265501 verified
raw
history blame
1.76 kB
# app.py
import time

import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import LlavaProcessor, LlavaForConditionalGeneration
# Load LLaVA model (MiniGPT-4 style)
# Downloads/loads the 7B LLaVA checkpoint at import time; this is slow and
# memory-heavy, so it is done once at module level rather than per request.
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id)
# Pin inference to CPU (no GPU assumed in this deployment).
device = torch.device("cpu")
model.to(device)
# Function: read webcam, yield frame + LLaVA caption every few seconds
def webcam_llava():
    """Stream webcam frames with LLaVA-generated captions.

    Generator: grabs a frame from the default webcam, runs LLaVA image
    captioning on it, yields ``(rgb_frame, caption)``, then sleeps before
    the next iteration (CPU inference on a 7B model is slow).

    Yields:
        tuple: (RGB frame as a numpy array, caption string).

    Raises:
        RuntimeError: if the webcam cannot be opened.
    """
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        raise RuntimeError("Webcam could not be opened.")
    # try/finally guarantees the camera is released even when the consumer
    # abandons the generator early (the original only released on read failure).
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV captures BGR; PIL and the processor expect RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)
            # --- Compose prompt for LLaVA ---
            prompt = "<image>\nUSER: Describe this scene in detail.\nASSISTANT:"
            inputs = processor(prompt, pil_image, return_tensors="pt").to(device)
            # Inference only: disable autograd to avoid tracking gradients
            # (saves memory and time on CPU).
            with torch.no_grad():
                output = model.generate(**inputs, max_new_tokens=200)
            caption = processor.decode(output[0], skip_special_tokens=True)
            # Yield current frame + caption
            yield rgb_frame, caption
            # BUG FIX: cv2.waitKey() only waits while pumping events for a
            # HighGUI window; with no window (headless server) it returns
            # immediately, so the intended 10 s pause never happened and the
            # loop ran generation back-to-back. time.sleep actually waits.
            time.sleep(10)  # 10 seconds for CPU safety
    finally:
        cap.release()
# Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# πŸŽ₯ LLaVA MiniGPT-4 Webcam Captioning\n_(CPU, slow but descriptive)_")
    # Live webcam frame (RGB array yielded by webcam_llava).
    webcam_display = gr.Image(label="Live Webcam")
    # Most recent caption generated for the displayed frame.
    description = gr.Textbox(label="LLaVA Caption")
    # On page load, start consuming the (frame, caption) generator and push
    # each yielded pair into the two components above.
    # NOTE(review): `every=1` re-triggers fn once per second; combined with a
    # generator fn this may restart the stream each tick rather than resume it
    # -- confirm against the installed Gradio version's `load(every=...)`
    # semantics (a plain generator fn streams on its own without `every`).
    demo.load(
        fn=webcam_llava,
        inputs=None,
        outputs=[webcam_display, description],
        every=1
    )
demo.launch()