Spaces:
Build error
Build error
File size: 2,280 Bytes
a3895ed f9d091a a3895ed f9d091a a3895ed f9d091a a3895ed f9d091a a3895ed f9d091a 0932151 f9d091a a3895ed f9d091a a3895ed f9d091a a3895ed f9d091a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import gradio as gr
import cv2
import tempfile
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
import os
# Load BLIP-2 model (FLAN-T5 - CPU friendly)
# Module-level load: both the processor (tokenizer + image preprocessor) and
# the generation model are downloaded/cached on first run and shared by all
# requests. No .to("cuda") call, so inference runs on CPU by default.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")
def describe_image(image):
    """Generate a natural-language caption for a PIL image with BLIP-2.

    Args:
        image: A PIL image (any mode; converted to RGB internally).

    Returns:
        The stripped caption string decoded from the model output.
    """
    rgb = image.convert("RGB")
    model_inputs = processor(images=rgb, return_tensors="pt")
    token_ids = model.generate(**model_inputs, max_new_tokens=50)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()
def extract_video_frames(video_path, interval=30):
    """Sample frames from a video at a fixed frame interval.

    Args:
        video_path: Path to a video file readable by OpenCV.
        interval: Keep every ``interval``-th frame (default 30).

    Returns:
        A list of ``(frame_index, PIL.Image)`` pairs in RGB mode; empty
        if the file cannot be opened or decoded.

    Raises:
        ValueError: If ``interval`` is not a positive integer (would
            otherwise surface as an opaque ZeroDivisionError).
    """
    if interval <= 0:
        raise ValueError("interval must be a positive integer")
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        count = 0
        while True:
            success, frame = cap.read()
            if not success:
                break
            if count % interval == 0:
                # OpenCV decodes to BGR; PIL expects RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append((count, Image.fromarray(frame_rgb)))
            count += 1
    finally:
        # Release the capture handle even if decoding raises mid-loop.
        cap.release()
    return frames
def handle_upload(file):
    """Dispatch an uploaded file to image or video captioning.

    Args:
        file: A file-like object with a ``.name`` attribute (Gradio's
            upload wrapper). Extension decides the processing path.

    Returns:
        A human-readable string: a single caption for images, one
        caption per sampled frame for videos, or an error message for
        unsupported types / unreadable videos.
    """
    name = file.name.lower()
    if name.endswith((".jpg", ".jpeg", ".png")):
        image = Image.open(file)
        caption = describe_image(image)
        return f"πΌοΈ Image Caption:\n{caption}"
    elif name.endswith((".mp4", ".mov", ".avi", ".mkv")):
        # OpenCV needs a real filesystem path, so spill the upload to a
        # temporary file first.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
            tmp.write(file.read())
            tmp_path = tmp.name
        try:
            frames = extract_video_frames(tmp_path, interval=30)  # ~1 fps at 30fps video
            captions = []
            for idx, frame in frames:
                caption = describe_image(frame)
                captions.append(f"π Frame {idx}: {caption}")
        finally:
            # Always remove the temp copy, even if captioning fails.
            os.remove(tmp_path)
        if not captions:
            # Previously an unreadable video silently returned "".
            return "β Could not read any frames from the video."
        return "\n".join(captions)
    else:
        return "β Unsupported file type. Please upload an image or video."
# Gradio UI
# Single-input / single-output interface; launch() blocks and serves the app.
# (Removed a stray trailing "|" artifact that made the call a syntax error.)
gr.Interface(
    fn=handle_upload,
    inputs=gr.File(label="Upload Image or Video"),
    outputs=gr.Textbox(label="Scene Descriptions"),
    title="π§ Scene Understanding AI β BLIP-2 (Image + Video)",
    description="Upload a photo or video. The AI will describe the scene(s) using BLIP-2 (FLAN-T5). Works on CPU."
).launch()