# app.py
#
# Live webcam captioning with LLaVA (llava-1.5-7b) behind a Gradio UI.
# Everything runs on CPU, so each caption takes several seconds; the
# capture loop sleeps between frames to keep the machine responsive.

import time

import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import LlavaProcessor, LlavaForConditionalGeneration

# --- Load LLaVA model (MiniGPT-4 style), CPU only ---
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id)
device = torch.device("cpu")
model.to(device)
model.eval()  # inference only; disables dropout etc.

# Seconds between captioned frames — CPU inference is slow.
FRAME_INTERVAL_S = 10.0


def webcam_llava():
    """Generator: grab webcam frames and yield (RGB frame, LLaVA caption) pairs.

    Yields:
        tuple[numpy.ndarray, str]: the RGB frame and the generated caption.

    Raises:
        RuntimeError: if the default webcam (device 0) cannot be opened.
    """
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        raise RuntimeError("Webcam could not be opened.")
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # camera disconnected or stream ended

            # OpenCV delivers BGR; PIL / transformers expect RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)

            # --- Compose prompt for LLaVA ---
            # LLaVA-1.5 requires the <image> placeholder so the processor
            # knows where to splice the vision tokens into the prompt.
            prompt = "USER: <image>\nDescribe this scene in detail.\nASSISTANT:"
            # Keyword args: the positional order of (text, images) has
            # changed across transformers versions.
            inputs = processor(
                text=prompt, images=pil_image, return_tensors="pt"
            ).to(device)

            # No gradients needed at inference time; saves memory on CPU.
            with torch.inference_mode():
                output = model.generate(**inputs, max_new_tokens=200)

            # Decode only the newly generated tokens — decoding output[0]
            # whole would echo the prompt back into the caption.
            prompt_len = inputs["input_ids"].shape[1]
            caption = processor.decode(
                output[0][prompt_len:], skip_special_tokens=True
            ).strip()

            # Yield current frame + caption to the Gradio streaming outputs.
            yield rgb_frame, caption

            # NOTE: cv2.waitKey only pumps events for a HighGUI window and
            # returns immediately when none exists (Gradio is the UI here),
            # so time.sleep is the correct way to pace the loop.
            time.sleep(FRAME_INTERVAL_S)
    finally:
        # Always release the camera, even if capture/generation raises.
        cap.release()


# --- Gradio app ---
with gr.Blocks() as demo:
    gr.Markdown("# 🎥 LLaVA MiniGPT-4 Webcam Captioning\n_(CPU, slow but descriptive)_")
    webcam_display = gr.Image(label="Live Webcam")
    description = gr.Textbox(label="LLaVA Caption")
    demo.load(
        fn=webcam_llava,
        inputs=None,
        outputs=[webcam_display, description],
        every=1,
    )

demo.launch()