Spaces:
Build error
Build error
| import gradio as gr | |
| import cv2 | |
| import tempfile | |
| from PIL import Image | |
| from transformers import Blip2Processor, Blip2ForConditionalGeneration | |
| import torch | |
| import os | |
# BLIP-2 with a FLAN-T5 language head: large enough to caption well,
# small enough to run on CPU (no GPU required).
MODEL_ID = "Salesforce/blip2-flan-t5-xl"
processor = Blip2Processor.from_pretrained(MODEL_ID)
model = Blip2ForConditionalGeneration.from_pretrained(MODEL_ID)
def describe_image(image):
    """Generate a one-sentence caption for a PIL image using BLIP-2.

    Args:
        image: a PIL.Image in any mode; converted to RGB before encoding.

    Returns:
        The decoded caption string, stripped of surrounding whitespace.
    """
    image = image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    # Inference only — disable autograd tracking to cut memory and time,
    # which matters on the CPU-only deployment this app targets.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=50)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    return caption
def extract_video_frames(video_path, interval=30):
    """Sample every `interval`-th frame from a video file.

    Args:
        video_path: path to a video file readable by OpenCV.
        interval: keep one frame out of every `interval` (e.g. 30 ≈ 1 fps
            for 30 fps footage — the rate depends on the source video).

    Returns:
        A list of (frame_index, PIL.Image) tuples in playback order;
        empty if the file cannot be opened or has no frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        count = 0
        while True:
            success, frame = cap.read()
            if not success:
                break
            if count % interval == 0:
                # OpenCV decodes to BGR; PIL expects RGB.
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append((count, Image.fromarray(rgb)))
            count += 1
    finally:
        # Release the capture even if conversion raises mid-loop,
        # so the file handle is never leaked.
        cap.release()
    return frames
def handle_upload(file):
    """Route an uploaded file to image or video captioning.

    Args:
        file: file-like object from gr.File with a `.name` attribute
            (and `.read()` for video uploads).

    Returns:
        A human-readable string: a single caption for images, one line
        per sampled frame for videos, or an error message for anything else.
    """
    name = file.name.lower()
    if name.endswith((".jpg", ".jpeg", ".png")):
        caption = describe_image(Image.open(file))
        # NOTE(review): emoji restored from mojibake in the original source.
        return f"🖼️ Image Caption:\n{caption}"
    elif name.endswith((".mp4", ".mov", ".avi", ".mkv")):
        # OpenCV needs a real path, so spill the upload to a temp file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
            tmp.write(file.read())
            tmp_path = tmp.name
        try:
            frames = extract_video_frames(tmp_path, interval=30)  # ~1 fps at 30 fps
            captions = [f"🎞 Frame {idx}: {describe_image(frame)}" for idx, frame in frames]
        finally:
            # Fix: the original leaked the temp file whenever captioning
            # raised; always clean it up.
            os.remove(tmp_path)
        if not captions:
            # Fix: the original silently returned "" for unreadable videos.
            return "⚠️ No frames could be read from the video."
        return "\n".join(captions)
    else:
        return "❌ Unsupported file type. Please upload an image or video."
# Gradio UI: one file input (image or video), plain-text captions out.
# NOTE(review): title emoji/dash restored from mojibake in the original source.
demo = gr.Interface(
    fn=handle_upload,
    inputs=gr.File(label="Upload Image or Video"),
    outputs=gr.Textbox(label="Scene Descriptions"),
    title="🧠 Scene Understanding AI — BLIP-2 (Image + Video)",
    description="Upload a photo or video. The AI will describe the scene(s) using BLIP-2 (FLAN-T5). Works on CPU.",
)
demo.launch()