WaysAheadGlobal committed on
Commit
4265501
·
verified ·
1 Parent(s): 1c5a277

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -22
app.py CHANGED
@@ -2,49 +2,61 @@
2
 
3
  import gradio as gr
4
  import torch
5
- from transformers import BlipProcessor, BlipForConditionalGeneration
6
  import cv2
7
  from PIL import Image
 
8
 
9
- # Load BLIP captioning model
10
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
11
- model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
12
 
13
  device = torch.device("cpu")
14
  model.to(device)
15
 
16
- # Live webcam captioning generator
17
- def webcam_caption():
18
- cap = cv2.VideoCapture(0) # open webcam
 
 
 
19
  while True:
20
  ret, frame = cap.read()
21
  if not ret:
22
  break
23
 
24
- # Convert OpenCV frame (BGR) to RGB PIL Image
25
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
26
- image = Image.fromarray(frame_rgb)
27
 
28
- # Generate caption
29
- inputs = processor(images=image, return_tensors="pt").to(device)
30
- out = model.generate(**inputs, max_new_tokens=50)
31
- caption = processor.decode(out[0], skip_special_tokens=True)
32
 
33
- yield frame_rgb, caption
 
 
 
 
 
 
 
 
34
 
35
  cap.release()
36
 
37
- # Gradio interface
38
  with gr.Blocks() as demo:
39
- gr.Markdown("## 🎥 Live Webcam BLIP Captioning (CPU)")
40
- video = gr.Image(label="Webcam Stream")
41
- text = gr.Textbox(label="Caption")
 
42
 
43
  demo.load(
44
- fn=webcam_caption,
45
  inputs=None,
46
- outputs=[video, text],
47
- every=2 # call generator every 2 sec (adjust if you want)
48
  )
49
 
50
  demo.launch()
 
2
 
3
  import gradio as gr
4
  import torch
 
5
  import cv2
6
  from PIL import Image
7
+ from transformers import LlavaProcessor, LlavaForConditionalGeneration
8
 
9
# Load the pretrained LLaVA 1.5 (7B) vision-language checkpoint and its
# matching processor (tokenizer + image preprocessing).
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)

# Inference runs on CPU. Module.to() returns the module itself, so the
# device move is chained onto the load.
device = torch.device("cpu")
model = LlavaForConditionalGeneration.from_pretrained(model_id).to(device)
16
 
17
# Read the webcam and yield each captured frame with its LLaVA caption.
def webcam_llava():
    """Yield ``(rgb_frame, caption)`` pairs captured from webcam 0.

    Each iteration grabs one frame, converts it from OpenCV's BGR layout to
    RGB, asks the module-level LLaVA ``model`` to describe it, yields the
    result, and then pauses for ten seconds before grabbing the next frame.
    The loop ends when the camera stops delivering frames.

    Yields:
        tuple: The RGB frame (as returned by ``cv2.cvtColor``) and the
        generated caption string, without the echoed prompt.

    Raises:
        RuntimeError: If the webcam could not be opened.
    """
    # Local import keeps this block self-contained; only the pause needs it.
    import time

    # The prompt never changes, so build it once outside the loop.
    prompt = "<image>\nUSER: Describe this scene in detail.\nASSISTANT:"

    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            raise RuntimeError("Webcam could not be opened.")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Convert BGR (OpenCV) to RGB and wrap it as a PIL image.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)

            # Keyword arguments: the positional order of text/images is not
            # the same across transformers releases.
            inputs = processor(
                text=prompt, images=pil_image, return_tensors="pt"
            ).to(device)

            output = model.generate(**inputs, max_new_tokens=200)

            # generate() returns the prompt tokens followed by the new ones;
            # decode only the new tokens so the caption omits the prompt.
            prompt_len = inputs["input_ids"].shape[1]
            caption = processor.decode(
                output[0][prompt_len:], skip_special_tokens=True
            ).strip()

            yield rgb_frame, caption

            # Pause before the next frame. cv2.waitKey is a HighGUI call
            # that needs an open window, so it is not usable as a sleep here.
            time.sleep(10)  # 10 seconds for CPU safety
    finally:
        # Runs on normal exit, on errors, and when the consumer closes the
        # generator, so the camera handle is always released.
        cap.release()
47
 
48
# Gradio app: an image pane for the current frame and a textbox for its
# caption, both fed by the webcam_llava generator.
with gr.Blocks() as demo:
    gr.Markdown("# 🎥 LLaVA MiniGPT-4 Webcam Captioning\n_(CPU, slow but descriptive)_")

    webcam_display = gr.Image(label="Live Webcam")
    description = gr.Textbox(label="LLaVA Caption")

    # webcam_llava is an endless generator that streams updates by itself,
    # so it is started once on page load. No `every=` interval is passed:
    # re-triggering it would start another run that reopens the webcam.
    demo.load(
        fn=webcam_llava,
        inputs=None,
        outputs=[webcam_display, description],
    )

# Generator handlers stream their yields through the queue; enable it
# explicitly so this also works where it is not on by default.
demo.queue()
demo.launch()