Spaces:
Build error
Build error
| import gradio as gr | |
| import cv2 | |
| import tempfile | |
| from PIL import Image | |
| from transformers import Blip2Processor, Blip2ForConditionalGeneration | |
| import torch | |
| import os | |
# BLIP-2 with a FLAN-T5 language head: large enough to caption well,
# small enough to run on CPU (no GPU required).
MODEL_ID = "Salesforce/blip2-flan-t5-xl"
processor = Blip2Processor.from_pretrained(MODEL_ID)
model = Blip2ForConditionalGeneration.from_pretrained(MODEL_ID)
def describe_image(image):
    """Generate a one-sentence caption for a PIL image using BLIP-2.

    Args:
        image: a PIL.Image in any mode; converted to RGB before encoding.

    Returns:
        The decoded caption string, stripped of surrounding whitespace.
    """
    image = image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    # Inference only — disable autograd tracking to cut memory and time,
    # which matters on the CPU-only deployment this app targets.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=50)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    return caption
def extract_video_frames(video_path, interval=30):
    """Sample every `interval`-th frame from a video file.

    Args:
        video_path: path to a video file readable by OpenCV.
        interval: keep one frame out of every `interval` (e.g. 30 ≈ 1 fps
            for 30 fps footage — the rate depends on the source video).

    Returns:
        A list of (frame_index, PIL.Image) tuples in playback order;
        empty if the file cannot be opened or has no frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        count = 0
        while True:
            success, frame = cap.read()
            if not success:
                break
            if count % interval == 0:
                # OpenCV decodes to BGR; PIL expects RGB.
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append((count, Image.fromarray(rgb)))
            count += 1
    finally:
        # Release the capture even if conversion raises mid-loop,
        # so the file handle is never leaked.
        cap.release()
    return frames
def handle_upload(file):
    """Route an uploaded file to image or video captioning.

    Args:
        file: file-like object from gr.File with a `.name` attribute
            (and `.read()` for video uploads).

    Returns:
        A human-readable string: a single caption for images, one line
        per sampled frame for videos, or an error message for anything else.
    """
    name = file.name.lower()
    if name.endswith((".jpg", ".jpeg", ".png")):
        caption = describe_image(Image.open(file))
        # NOTE(review): emoji restored from mojibake in the original source.
        return f"🖼️ Image Caption:\n{caption}"
    elif name.endswith((".mp4", ".mov", ".avi", ".mkv")):
        # OpenCV needs a real path, so spill the upload to a temp file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
            tmp.write(file.read())
            tmp_path = tmp.name
        try:
            frames = extract_video_frames(tmp_path, interval=30)  # ~1 fps at 30 fps
            captions = [f"🎞 Frame {idx}: {describe_image(frame)}" for idx, frame in frames]
        finally:
            # Fix: the original leaked the temp file whenever captioning
            # raised; always clean it up.
            os.remove(tmp_path)
        if not captions:
            # Fix: the original silently returned "" for unreadable videos.
            return "⚠️ No frames could be read from the video."
        return "\n".join(captions)
    else:
        return "❌ Unsupported file type. Please upload an image or video."
# Gradio UI: one file input (image or video), plain-text captions out.
# NOTE(review): title emoji/dash restored from mojibake in the original source.
demo = gr.Interface(
    fn=handle_upload,
    inputs=gr.File(label="Upload Image or Video"),
    outputs=gr.Textbox(label="Scene Descriptions"),
    title="🧠 Scene Understanding AI — BLIP-2 (Image + Video)",
    description="Upload a photo or video. The AI will describe the scene(s) using BLIP-2 (FLAN-T5). Works on CPU.",
)
demo.launch()