# Hugging Face Space: app.py (the paste below was wrapped in build-log
# table markup; reconstructed as runnable Python).
# app.py
import gradio as gr
import torch
import cv2
from PIL import Image
from transformers import LlavaProcessor, LlavaForConditionalGeneration

# --- Model setup -----------------------------------------------------------
# LLaVA 1.5 7B (MiniGPT-4-style visual chat). Loaded once at module import
# so every request shares the same weights.
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id)

# CPU inference: very slow for a 7B model, but portable / no GPU required.
device = torch.device("cpu")
model.to(device)
model.eval()  # inference only: disables dropout and other train-time behavior
# Function: read webcam frames and yield (frame, LLaVA caption) pairs.
def webcam_llava():
    """Stream webcam frames together with LLaVA-generated captions.

    Yields:
        tuple: ``(rgb_frame, caption)`` — the RGB numpy frame and the
        model's text description of it.

    Raises:
        RuntimeError: if the webcam cannot be opened.

    NOTE(review): ``cv2.VideoCapture(0)`` opens the *server's* camera; on a
    hosted Space there is none — this only works when run locally. Confirm
    the deployment target.
    """
    import time  # local import: only needed to pace the capture loop

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        raise RuntimeError("Webcam could not be opened.")
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV delivers BGR; PIL and the processor expect RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)

            # LLaVA chat template: <image> placeholder + USER/ASSISTANT turns.
            prompt = "<image>\nUSER: Describe this scene in detail.\nASSISTANT:"
            # Keyword arguments: the positional order of (text, images) in
            # LlavaProcessor.__call__ changed between transformers versions,
            # so passing them positionally can silently swap the two.
            inputs = processor(
                images=pil_image, text=prompt, return_tensors="pt"
            ).to(device)

            # no_grad: skip building an autograd graph during generation.
            with torch.no_grad():
                output = model.generate(**inputs, max_new_tokens=200)
            decoded = processor.decode(output[0], skip_special_tokens=True)
            # decode() echoes the whole prompt; keep only the model's reply.
            caption = decoded.split("ASSISTANT:")[-1].strip()

            yield rgb_frame, caption

            # cv2.waitKey only pumps events for a HighGUI window; in a
            # headless app it returns immediately, so the original 10 s
            # pause never happened. Use a real sleep to pace the loop.
            time.sleep(10)  # 10 s between captions for CPU safety
    finally:
        # Release the camera even if the consumer abandons the generator.
        cap.release()
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    # "π₯" in the original was mojibake for the 🎥 emoji — restored.
    gr.Markdown(
        "# 🎥 LLaVA MiniGPT-4 Webcam Captioning\n_(CPU, slow but descriptive)_"
    )
    webcam_display = gr.Image(label="Live Webcam")
    description = gr.Textbox(label="LLaVA Caption")
    # webcam_llava is a generator: Gradio streams each yielded
    # (frame, caption) pair to the outputs as it arrives. Do NOT also pass
    # `every=` — that re-launches a new infinite generator every second on
    # top of the one still running, piling up webcam/model work.
    demo.load(
        fn=webcam_llava,
        inputs=None,
        outputs=[webcam_display, description],
    )

demo.launch()