WaysAheadGlobal committed on
Commit
f9d091a
·
verified ·
1 Parent(s): 0932151

Update app.py

Files changed (1)
  1. app.py +51 -26
app.py CHANGED
@@ -1,42 +1,67 @@
  import gradio as gr
  import cv2
  from PIL import Image
  from transformers import Blip2Processor, Blip2ForConditionalGeneration
  import torch

- # Load BLIP-2 FLAN-T5 model (CPU-compatible)
  processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
  model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")

- # Function to capture frame and generate caption
- def describe_live_frame():
-     cap = cv2.VideoCapture(0)  # 0 = default webcam
-     if not cap.isOpened():
-         return None, "❌ Cannot access camera. Try reconnecting or use a different device."
-
-     ret, frame = cap.read()
-     cap.release()
-     if not ret:
-         return None, "❌ Failed to capture frame."
-
-     # Convert OpenCV frame to PIL
-     frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-     image = Image.fromarray(frame_rgb)
-
-     # Run BLIP-2 captioning
      inputs = processor(images=image, return_tensors="pt")
      generated_ids = model.generate(**inputs, max_new_tokens=50)
      caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

-     return image, caption

- # Gradio interface
- with gr.Blocks() as demo:
-     gr.Markdown("## 🧠 Live Scene Captioning (Simulated Real-Time)\nBLIP-2 FLAN-T5 – CPU Friendly")
-     btn = gr.Button("📸 Capture & Describe Scene")
-     img_output = gr.Image(label="Captured Frame")
-     text_output = gr.Textbox(label="Scene Description")

-     btn.click(fn=describe_live_frame, inputs=[], outputs=[img_output, text_output])

- demo.launch()

  import gradio as gr
  import cv2
+ import tempfile
  from PIL import Image
  from transformers import Blip2Processor, Blip2ForConditionalGeneration
  import torch
+ import os

+ # Load BLIP-2 model (FLAN-T5 - CPU friendly)
  processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
  model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")

+ def describe_image(image):
+     image = image.convert("RGB")
      inputs = processor(images=image, return_tensors="pt")
      generated_ids = model.generate(**inputs, max_new_tokens=50)
      caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+     return caption
+
+ def extract_video_frames(video_path, interval=30):
+     cap = cv2.VideoCapture(video_path)
+     frames = []
+     count = 0
+     success = True
+     while success:
+         success, frame = cap.read()
+         if not success:
+             break
+         if count % interval == 0:
+             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             frames.append((count, Image.fromarray(frame_rgb)))
+         count += 1
+     cap.release()
+     return frames

+ def handle_upload(file):
+     name = file.name.lower()
+     if name.endswith((".jpg", ".jpeg", ".png")):
+         image = Image.open(file)
+         caption = describe_image(image)
+         return f"🖼️ Image Caption:\n{caption}"
+
+     elif name.endswith((".mp4", ".mov", ".avi", ".mkv")):
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
+             tmp.write(file.read())
+             tmp_path = tmp.name

+         frames = extract_video_frames(tmp_path, interval=30)  # 1 fps
+         captions = []
+         for idx, frame in frames:
+             caption = describe_image(frame)
+             captions.append(f"🕒 Frame {idx}: {caption}")

+         os.remove(tmp_path)
+         return "\n".join(captions)
+
+     else:
+         return "❌ Unsupported file type. Please upload an image or video."

+ # Gradio UI
+ gr.Interface(
+     fn=handle_upload,
+     inputs=gr.File(label="Upload Image or Video"),
+     outputs=gr.Textbox(label="Scene Descriptions"),
+     title="🧠 Scene Understanding AI – BLIP-2 (Image + Video)",
+     description="Upload a photo or video. The AI will describe the scene(s) using BLIP-2 (FLAN-T5). Works on CPU."
+ ).launch()
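
The captioning path (describe_image) is carried over unchanged from the old describe_live_frame. If a GPU happens to be available, the same call can be moved off the CPU along these lines; this is a sketch, not part of the commit, and describe_image_on_device is a hypothetical name:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def describe_image_on_device(image):
    # Same BLIP-2 generate/decode sequence as describe_image in the diff,
    # with the input tensors moved to the chosen device first.
    inputs = processor(images=image.convert("RGB"), return_tensors="pt").to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=50)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()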
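One caveat on the new extract_video_frames: interval=30 keeps every 30th frame, so the "# 1 fps" comment only holds for 30 fps footage. A minimal rate-aware variant, assuming OpenCV reports the frame rate correctly (extract_frames_at_fps is a hypothetical name):

import cv2
from PIL import Image

def extract_frames_at_fps(video_path, target_fps=1.0):
    # Derive the sampling stride from the video's native frame rate
    # instead of hard-coding every 30th frame.
    cap = cv2.VideoCapture(video_path)
    native_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0  # returns 0.0 when unreported; assume 30
    interval = max(1, round(native_fps / target_fps))
    frames, count = [], 0
    while True:
        success, frame = cap.read()
        if not success:
            break
        if count % interval == 0:
            frames.append((count, Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))))
        count += 1
    cap.release()
    return frames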
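A compatibility note on handle_upload, flagged as an assumption rather than a verified issue with this Space: depending on the Gradio version, gr.File can hand the callback a filepath string rather than an open file object, in which case file.read() and file.name would not behave as written. A defensive sketch (normalize_upload is a hypothetical helper):

import os

def normalize_upload(file):
    # Accept either a str path or a file-like object exposing .name,
    # and return the path plus a lowercased name for the extension checks.
    path = file if isinstance(file, str) else file.name
    return path, os.path.basename(path).lower()

With a real path in hand, Image.open(path) and extract_video_frames(path) can be called directly, which would also remove the need for the NamedTemporaryFile copy and the os.remove cleanup.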