# vp/app.py — Hugging Face Space by cngsm ("Upload 2 files", commit e7e10d9, verified)
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import cv2
import os
# Load the BLIP v1 base checkpoint (lightweight; runs on the free CPU tier).
# `processor` handles image preprocessing and token decoding; `model` generates
# the captions. Both are module-level so `process_video` can reuse them across calls.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
def process_video(video_path):
    """Caption a video in 4-second segments with BLIP and build 8 s prompts.

    For each consecutive 4 s segment, the frame at the segment's temporal
    midpoint is extracted, saved under ``frames/``, and captioned with the
    module-level BLIP model. Consecutive pairs of captions are then joined
    into combined 8 s prompts.

    Args:
        video_path: Path to a video file readable by OpenCV.

    Returns:
        One caption line per segment, a blank line, then the combined prompts,
        all joined into a single string.

    Raises:
        ValueError: If the video cannot be opened, or it reports a
            non-positive FPS (segment duration would be undefined).
    """
    os.makedirs("frames", exist_ok=True)

    vidcap = cv2.VideoCapture(video_path)
    try:
        # Original code assumed the capture opened; an unopened capture would
        # silently report 0 frames and produce empty output.
        if not vidcap.isOpened():
            raise ValueError(f"Could not open video: {video_path}")

        fps = vidcap.get(cv2.CAP_PROP_FPS)
        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Corrupt or streamed inputs can report 0 FPS, which previously
        # raised an opaque ZeroDivisionError on the next line.
        if fps <= 0:
            raise ValueError("Video reports non-positive FPS; cannot segment.")
        duration = total_frames / fps

        # Build (start, end) boundaries for consecutive 4 s segments;
        # the final segment is clipped to the actual duration.
        segments = []
        current = 0
        while current < duration:
            segments.append((current, min(current + 4, duration)))
            current += 4

        descriptions = []
        for i, (start, end) in enumerate(segments):
            # Seek to the frame at the segment's temporal midpoint.
            center_frame = int(((start + end) / 2) * fps)
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, center_frame)
            success, frame = vidcap.read()
            if not success:
                # Seek past EOF or decode failure: skip this segment.
                continue

            # Keep writing the JPEG so the frames/ directory side effect is
            # preserved for inspection/debugging.
            img_path = f"frames/frame_{i}.jpg"
            cv2.imwrite(img_path, frame)

            # OpenCV frames are BGR; convert in memory instead of the original
            # write-then-reload round trip through the lossy JPEG on disk.
            pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            inputs = processor(images=pil_image, return_tensors="pt")
            out = model.generate(**inputs)
            caption = processor.decode(out[0], skip_special_tokens=True)
            descriptions.append(
                f"Segmento {i+1} ({start:.1f}-{end:.1f}s): {caption}"
            )
    finally:
        # Release the capture handle even if captioning raised mid-loop
        # (the original leaked it on any exception).
        vidcap.release()

    # Pair consecutive 4 s captions into combined 8 s prompts.
    prompts = []
    for j in range(0, len(descriptions), 2):
        combined = " ".join(descriptions[j:j+2])
        prompts.append(f"Prompt {j//2 + 1}: {combined}")

    return "\n".join(descriptions) + "\n\n" + "\n".join(prompts)
# Gradio UI: upload a video, get per-segment captions plus combined 8 s prompts.
iface = gr.Interface(
    fn=process_video,
    inputs=gr.Video(),
    outputs="text",
    title="Video Analyzer with BLIP (CPU Friendly)",
    description="Faz análise de frames centrais de segmentos de 4s e gera prompts combinados de 8s."
)

if __name__ == "__main__":
    # Guard the entry point so importing this module (e.g. from tests or
    # another app) does not start the web server as a side effect.
    # HF Spaces executes app.py as __main__, so platform behavior is unchanged.
    iface.launch()