import moviepy.editor as mp
import cv2
import numpy as np
from PIL import Image
import os
import json
from datetime import datetime
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# BLIP setup — loaded once at module import so every analyzer instance
# shares the same captioning model (loading is the expensive part).
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


class VideoSceneAnalyzer:
    """Extract keyframes from a video and generate AI image prompts for each.

    For every detected scene change a keyframe is saved to disk, captioned
    with BLIP, and turned into a detailed description plus a positive and
    negative image-generation prompt. Results can be serialized to JSON.
    """

    def __init__(self, video_path, scene_duration=8):
        # Path to the input video file.
        self.video_path = video_path
        # Nominal scene length in seconds (kept for callers; not used by
        # the difference-based keyframe detector below).
        self.scene_duration = scene_duration
        # moviepy clip handle, populated by load_video().
        self.clip = None
        # Metadata dict filled by load_video().
        self.video_info = {}
        # Per-scene result dicts filled by extract_keyframes().
        self.scenes = []

    def load_video(self):
        """Open the video with moviepy and populate self.video_info.

        Returns True on success, False on failure (error is printed).
        """
        try:
            self.clip = mp.VideoFileClip(self.video_path)
            self.video_info = {
                'duration': self.clip.duration,
                'fps': self.clip.fps,
                'size': self.clip.size,
                'aspect_ratio': self.clip.size[0] / self.clip.size[1],
                'total_frames': int(self.clip.duration * self.clip.fps),
            }
            return True
        except Exception as e:
            print(f"Erro ao carregar vídeo: {e}")
            return False

    def describe_image_and_generate_prompt(self, frame, idx):
        """Caption an RGB frame with BLIP and build prompts from the caption.

        Args:
            frame: RGB image as a numpy array (H, W, 3).
            idx: 1-based scene number, used in the description text.

        Returns:
            (descricao, prompt, negative) — a detailed Portuguese scene
            description, a positive image-generation prompt, and a
            negative prompt.
        """
        # Convert in memory; the previous implementation wrote a temp JPEG
        # to disk, reopened it, and never deleted it (file leak + needless
        # lossy round-trip).
        img = Image.fromarray(np.uint8(frame)).convert("RGB")
        inputs = blip_processor(images=img, return_tensors="pt")
        # Inference only — no_grad avoids building an autograd graph.
        with torch.no_grad():
            out = blip_model.generate(**inputs)
        cap = blip_processor.decode(out[0], skip_special_tokens=True).strip().capitalize()
        descricao = (
            f"Cena {idx}: {cap}. "
            "Registrada ao entardecer com luz dourada suave, profundidade de campo rasa desfocando o fundo. "
            "O ambiente apresenta uma paisagem urbana vibrante com vegetação e construções ao longe, "
            "enquanto o sujeito exibe expressão serena e postura confiante. "
            "Atmosfera de leve nostalgia e contemplação."
        )
        prompt = (
            f"A cinematic, warm golden-hour shot of {cap}. "
            "Captured with a 35mm lens at f/1.8 for shallow depth of field, "
            "soft backlighting, and gentle handheld motion. "
            "Background features a vibrant urban setting with trees and distant buildings, "
            "evoking a sense of nostalgia and calm. "
            "--ar 16:9 --v 6 --style photorealistic --quality 2"
        )
        negative = (
            "--no (CGI artifacts, plastic textures, overexposed skies, cartoonish colors, static poses, low detail)"
        )
        return descricao, prompt, negative

    def extract_keyframes(self, threshold=30.0):
        """Detect scene changes by mean absolute frame difference.

        A frame whose grayscale mean-abs-diff against the previous frame
        exceeds ``threshold`` is saved as a keyframe, captioned, and
        appended to self.scenes. The first frame is always a keyframe.

        Returns True on success, False if the video cannot be opened.
        """
        output_dir = f"keyframes_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        os.makedirs(output_dir, exist_ok=True)

        cap = cv2.VideoCapture(self.video_path)
        if not cap.isOpened():
            print(f"Erro ao carregar vídeo: {self.video_path}")
            return False

        prev = None
        count = 0
        self.scenes = []
        try:
            success, frame = cap.read()
            while success:
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                # First frame has no predecessor — force it to register.
                diff = np.inf if prev is None else np.mean(cv2.absdiff(gray, prev))
                if prev is None or diff > threshold:
                    timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
                    # Convert once and reuse for both saving and captioning
                    # (original converted BGR->RGB twice per keyframe).
                    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    img_path = os.path.join(output_dir, f"keyframe_{count+1:02d}.jpg")
                    Image.fromarray(rgb).save(img_path)
                    desc, prmpt, neg = self.describe_image_and_generate_prompt(rgb, count + 1)
                    self.scenes.append({
                        'scene_number': count + 1,
                        'time': timestamp,
                        'image_path': img_path,
                        'descricao_detalhada': desc,
                        'prompt_ia': prmpt,
                        'negative_prompt': neg,
                    })
                    count += 1
                prev = gray
                success, frame = cap.read()
        finally:
            # Release the capture even if captioning raises.
            cap.release()
        return True

    def save_results(self, output_file=None):
        """Write video info and per-scene results to a UTF-8 JSON file.

        Args:
            output_file: Target path; a timestamped name is generated
                when omitted.

        Returns:
            The path of the file written.
        """
        if not output_file:
            output_file = f"video_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        results = {
            'video_info': self.video_info,
            'analysis_type': 'keyframe_extraction',
            'scenes': self.scenes,
            'generated_at': datetime.now().isoformat(),
        }
        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps the Portuguese text readable.
            json.dump(results, f, indent=2, ensure_ascii=False)
        return output_file

    def cleanup(self):
        """Release the moviepy clip handle, if one was opened."""
        if self.clip:
            self.clip.close()