File size: 2,280 Bytes
a3895ed
 
f9d091a
a3895ed
 
 
f9d091a
a3895ed
f9d091a
a3895ed
 
 
f9d091a
 
a3895ed
 
 
f9d091a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0932151
f9d091a
 
 
 
 
 
 
 
 
 
 
a3895ed
f9d091a
 
 
 
 
a3895ed
f9d091a
 
 
 
 
a3895ed
f9d091a
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
import cv2
import tempfile
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
import os

# BLIP-2 with a FLAN-T5 language head: runs acceptably on CPU, no GPU required.
_BLIP2_CHECKPOINT = "Salesforce/blip2-flan-t5-xl"
processor = Blip2Processor.from_pretrained(_BLIP2_CHECKPOINT)
model = Blip2ForConditionalGeneration.from_pretrained(_BLIP2_CHECKPOINT)

def describe_image(image):
    """Generate a one-line BLIP-2 caption for a PIL image.

    Args:
        image: A PIL.Image in any mode; converted to RGB before encoding.

    Returns:
        The decoded caption string with surrounding whitespace stripped.
    """
    image = image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    # Inference only: disable autograd bookkeeping, which the original left
    # enabled — it wastes memory and time, especially on CPU.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=50)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    return caption

def extract_video_frames(video_path, interval=30):
    """Sample frames from a video file.

    Args:
        video_path: Path to a video readable by OpenCV.
        interval: Keep every ``interval``-th frame (default 30).

    Returns:
        A list of ``(frame_index, PIL.Image)`` tuples in playback order.
    """
    capture = cv2.VideoCapture(video_path)
    sampled = []
    index = 0
    while True:
        ok, frame = capture.read()
        if not ok:
            # End of stream (or unreadable file) — stop sampling.
            break
        if index % interval == 0:
            # OpenCV decodes BGR; PIL expects RGB.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            sampled.append((index, Image.fromarray(rgb)))
        index += 1
    capture.release()
    return sampled

def handle_upload(file):
    """Caption an uploaded image or video and return a human-readable report.

    Args:
        file: Either a plain filesystem path (str) — what newer Gradio
            versions pass — or a file-like object with a ``.name`` attribute
            pointing at the uploaded file on disk (older Gradio tempfile
            wrappers).

    Returns:
        A caption string for images, one caption line per sampled frame for
        videos, or an error message for unsupported/unreadable input.
    """
    # Normalize to an on-disk path; Gradio uploads already live on disk.
    path = file if isinstance(file, str) else file.name
    lowered = path.lower()

    if lowered.endswith((".jpg", ".jpeg", ".png")):
        caption = describe_image(Image.open(path))
        return f"πŸ–ΌοΈ Image Caption:\n{caption}"

    if lowered.endswith((".mp4", ".mov", ".avi", ".mkv")):
        # Read the upload in place. The previous implementation copied the
        # bytes to a second NamedTemporaryFile and only removed it on the
        # success path, leaking the copy whenever captioning raised.
        frames = extract_video_frames(path, interval=30)  # ~1 fps for 30fps video
        if not frames:
            # Previously this fell through to "".join([]) == "" — useless.
            return "❌ Could not read any frames from the video."
        return "\n".join(
            f"πŸ•’ Frame {idx}: {describe_image(frame)}" for idx, frame in frames
        )

    return "❌ Unsupported file type. Please upload an image or video."

# Build the Gradio front-end: a single file input mapped onto handle_upload,
# with the generated captions rendered into a textbox.
demo = gr.Interface(
    fn=handle_upload,
    inputs=gr.File(label="Upload Image or Video"),
    outputs=gr.Textbox(label="Scene Descriptions"),
    title="🧠 Scene Understanding AI – BLIP-2 (Image + Video)",
    description="Upload a photo or video. The AI will describe the scene(s) using BLIP-2 (FLAN-T5). Works on CPU.",
)
demo.launch()