# app.py
"""Live webcam captioning with BLIP, served through a Gradio UI (CPU-only)."""

import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
import cv2
from PIL import Image

# Load the BLIP captioning model once at startup (base variant).
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

device = torch.device("cpu")
model.to(device)
model.eval()  # inference only — disable dropout / train-mode behavior


def webcam_caption():
    """Yield (RGB frame, caption) pairs from the default webcam.

    Opens capture device 0, captions each frame with BLIP, and yields until
    the camera stops delivering frames or the generator is closed.

    Yields:
        tuple[numpy.ndarray, str]: the RGB frame and its generated caption.

    Raises:
        RuntimeError: if the webcam cannot be opened.
    """
    cap = cv2.VideoCapture(0)  # open default webcam
    if not cap.isOpened():
        raise RuntimeError("Could not open webcam (device 0)")
    try:
        while True:
            ret, frame = cap.read()
            if not ret:  # camera disconnected or stream ended
                break

            # OpenCV delivers BGR; BLIP's processor expects RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(frame_rgb)

            # Generate a caption; no_grad avoids building autograd state
            # for every frame (pure inference).
            inputs = processor(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                out = model.generate(**inputs, max_new_tokens=50)
            caption = processor.decode(out[0], skip_special_tokens=True)

            yield frame_rgb, caption
    finally:
        # Runs on normal exit AND on GeneratorExit, so the camera handle is
        # released even when the UI stops consuming frames mid-stream —
        # the original only released after a failed read.
        cap.release()


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 🎥 Live Webcam BLIP Captioning (CPU)")
    video = gr.Image(label="Webcam Stream")
    text = gr.Textbox(label="Caption")

    demo.load(
        fn=webcam_caption,
        inputs=None,
        outputs=[video, text],
        # NOTE(review): `every=2` re-invokes the function each tick; with a
        # generator fn that re-opens the webcam per tick — confirm whether
        # periodic polling or continuous streaming is the intent.
        every=2,
    )

demo.launch()