File size: 1,761 Bytes
1c5a277
 
a3895ed
1c5a277
a3895ed
 
4265501
a3895ed
4265501
 
 
 
a3895ed
1c5a277
 
f9d091a
4265501
 
 
 
 
 
1c5a277
 
 
f9d091a
0932151
4265501
 
 
1c5a277
4265501
 
 
1c5a277
4265501
 
 
 
 
 
 
 
 
1c5a277
 
a3895ed
4265501
1c5a277
4265501
 
 
 
a3895ed
1c5a277
4265501
1c5a277
4265501
 
1c5a277
a3895ed
1c5a277
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# app.py

import time

import gradio as gr
import torch
import cv2
from PIL import Image
from transformers import LlavaProcessor, LlavaForConditionalGeneration

# Checkpoint to load: LLaVA 1.5 (7B) as published under the llava-hf namespace.
model_id = "llava-hf/llava-1.5-7b-hf"

# All inference in this app runs on the CPU.
device = torch.device("cpu")

# The processor builds the model inputs from prompt text + image;
# the model is placed on `device` right after it is loaded.
processor = LlavaProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id).to(device)

def webcam_llava():
    """Stream webcam frames together with a LLaVA description of each one.

    Opens the default camera (index 0), and for every frame it manages to
    read, runs the module-level ``processor``/``model`` on it and yields the
    result. The loop pauses between frames because generation is slow on CPU.
    The generator ends when a frame can no longer be read.

    Yields:
        tuple: ``(rgb_frame, caption)`` -- the captured frame converted from
        BGR to RGB, and the decoded text of the newly generated tokens
        (the prompt itself is not included).

    Raises:
        RuntimeError: If the webcam could not be opened.
    """
    prompt = "<image>\nUSER: Describe this scene in detail.\nASSISTANT:"
    pause_seconds = 10  # gap between captions; adjust as needed

    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            raise RuntimeError("Webcam could not be opened.")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV delivers BGR; PIL and the UI expect RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)

            # Keyword arguments keep this call independent of the
            # processor's positional parameter order.
            inputs = processor(
                text=prompt, images=pil_image, return_tensors="pt"
            ).to(device)

            output = model.generate(**inputs, max_new_tokens=200)

            # The generated sequence starts with the prompt tokens; decode
            # only what follows them so the caption is just the answer.
            prompt_length = inputs["input_ids"].shape[1]
            caption = processor.decode(
                output[0][prompt_length:], skip_special_tokens=True
            ).strip()

            yield rgb_frame, caption

            # time.sleep rather than cv2.waitKey: waitKey is a HighGUI
            # event-loop call and is only documented to work while an
            # OpenCV window exists, which this app never creates.
            time.sleep(pause_seconds)
    finally:
        # Runs on normal exit, on errors, and when the consumer closes the
        # generator early, so the camera is always handed back.
        cap.release()

# Gradio app: one page showing the latest webcam frame and its caption.
with gr.Blocks() as demo:
    gr.Markdown("# 🎥 LLaVA MiniGPT-4 Webcam Captioning\n_(CPU, slow but descriptive)_")

    webcam_display = gr.Image(label="Live Webcam")
    description = gr.Textbox(label="LLaVA Caption")

    # webcam_llava is a generator with its own loop and delay, so a single
    # invocation on page load streams updates for as long as it keeps
    # yielding. No `every=` re-trigger is used: it would only restart the
    # generator (and re-open the camera) once it finished.
    demo.load(
        fn=webcam_llava,
        inputs=None,
        outputs=[webcam_display, description],
    )

# Generator handlers stream through Gradio's queue; enabling it explicitly
# keeps this working on releases where the queue is not on by default.
demo.queue()
demo.launch()