File size: 2,280 Bytes
a3895ed
 
f9d091a
a3895ed
 
 
f9d091a
a3895ed
f9d091a
a3895ed
 
 
f9d091a
 
a3895ed
 
 
f9d091a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0932151
f9d091a
 
 
 
 
 
 
 
 
 
 
a3895ed
f9d091a
 
 
 
 
a3895ed
f9d091a
 
 
 
 
a3895ed
f9d091a
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
import cv2
import tempfile
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
import os

# BLIP-2 with a FLAN-T5 language head: runs acceptably on CPU, no GPU required.
_BLIP2_CHECKPOINT = "Salesforce/blip2-flan-t5-xl"
processor = Blip2Processor.from_pretrained(_BLIP2_CHECKPOINT)
model = Blip2ForConditionalGeneration.from_pretrained(_BLIP2_CHECKPOINT)

def describe_image(image):
    """Generate a one-line BLIP-2 caption for a PIL image.

    Args:
        image: A PIL.Image in any mode; converted to RGB before encoding.

    Returns:
        The decoded caption string with surrounding whitespace stripped.
    """
    image = image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    # Inference only: disable autograd bookkeeping, which the original left
    # enabled — it wastes memory and time, especially on CPU.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=50)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    return caption

def extract_video_frames(video_path, interval=30):
    """Sample frames from a video file.

    Args:
        video_path: Path to a video readable by OpenCV.
        interval: Keep every ``interval``-th frame (default 30).

    Returns:
        A list of ``(frame_index, PIL.Image)`` tuples in playback order.
    """
    capture = cv2.VideoCapture(video_path)
    sampled = []
    index = 0
    while True:
        ok, frame = capture.read()
        if not ok:
            # End of stream (or unreadable file) — stop sampling.
            break
        if index % interval == 0:
            # OpenCV decodes BGR; PIL expects RGB.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            sampled.append((index, Image.fromarray(rgb)))
        index += 1
    capture.release()
    return sampled

def handle_upload(file):
    """Caption an uploaded image or video and return a human-readable report.

    Args:
        file: Either a plain filesystem path (str) — what newer Gradio
            versions pass — or a file-like object with a ``.name`` attribute
            pointing at the uploaded file on disk (older Gradio tempfile
            wrappers).

    Returns:
        A caption string for images, one caption line per sampled frame for
        videos, or an error message for unsupported/unreadable input.
    """
    # Normalize to an on-disk path; Gradio uploads already live on disk.
    path = file if isinstance(file, str) else file.name
    lowered = path.lower()

    if lowered.endswith((".jpg", ".jpeg", ".png")):
        caption = describe_image(Image.open(path))
        return f"πŸ–ΌοΈ Image Caption:\n{caption}"

    if lowered.endswith((".mp4", ".mov", ".avi", ".mkv")):
        # Read the upload in place. The previous implementation copied the
        # bytes to a second NamedTemporaryFile and only removed it on the
        # success path, leaking the copy whenever captioning raised.
        frames = extract_video_frames(path, interval=30)  # ~1 fps for 30fps video
        if not frames:
            # Previously this fell through to "".join([]) == "" — useless.
            return "❌ Could not read any frames from the video."
        return "\n".join(
            f"πŸ•’ Frame {idx}: {describe_image(frame)}" for idx, frame in frames
        )

    return "❌ Unsupported file type. Please upload an image or video."

# Build the Gradio front-end: a single file input mapped onto handle_upload,
# with the generated captions rendered into a textbox.
demo = gr.Interface(
    fn=handle_upload,
    inputs=gr.File(label="Upload Image or Video"),
    outputs=gr.Textbox(label="Scene Descriptions"),
    title="🧠 Scene Understanding AI – BLIP-2 (Image + Video)",
    description="Upload a photo or video. The AI will describe the scene(s) using BLIP-2 (FLAN-T5). Works on CPU.",
)
demo.launch()