# video_analyzer / video_analyzer_keyframes.py
# Keyframe extraction + BLIP captioning for videos.
import moviepy.editor as mp
import cv2
import numpy as np
from PIL import Image
import os
import json
from datetime import datetime
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
# BLIP setup — loaded once at import time so every VideoSceneAnalyzer
# instance shares a single processor/model pair (first run downloads the
# weights from the Hugging Face hub; subsequent runs hit the local cache).
# NOTE(review): the model stays on CPU and in train() mode by default —
# inference still works, but confirm whether a .eval()/.to(device) step
# was intended by the original author.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
class VideoSceneAnalyzer:
    """Detect keyframes in a video and caption each one with BLIP.

    Workflow: ``load_video()`` -> ``extract_keyframes()`` -> ``save_results()``
    -> ``cleanup()``.  Keyframe images are written to a timestamped directory;
    per-scene metadata (timestamp, paths, BLIP-derived description and
    image-generation prompts) accumulates in ``self.scenes``.
    """

    def __init__(self, video_path, scene_duration=8):
        self.video_path = video_path
        # Kept for interface compatibility; not used by keyframe extraction.
        self.scene_duration = scene_duration
        self.clip = None        # moviepy VideoFileClip, set by load_video()
        self.video_info = {}    # duration/fps/size metadata, set by load_video()
        self.scenes = []        # per-keyframe dicts, filled by extract_keyframes()

    def load_video(self):
        """Open the video with moviepy and populate ``self.video_info``.

        Returns:
            bool: True on success; False if the file could not be opened
            (the error is printed, not raised).
        """
        try:
            self.clip = mp.VideoFileClip(self.video_path)
            self.video_info = {
                'duration': self.clip.duration,
                'fps': self.clip.fps,
                'size': self.clip.size,
                'aspect_ratio': self.clip.size[0] / self.clip.size[1],
                'total_frames': int(self.clip.duration * self.clip.fps)
            }
            return True
        except Exception as e:
            print(f"Erro ao carregar vídeo: {e}")
            return False

    def describe_image_and_generate_prompt(self, frame, idx):
        """Caption an RGB frame with BLIP and build scene description/prompts.

        Args:
            frame: RGB image as a numpy array (H, W, 3).
            idx: 1-based scene number, interpolated into the description.

        Returns:
            tuple[str, str, str]: (detailed description in Portuguese,
            positive image-generation prompt, negative prompt).
        """
        # Build the PIL image directly from the array.  The original wrote a
        # temp_XX.jpg to disk and re-read it, leaking one temp file per
        # keyframe into the working directory for no benefit.
        img = Image.fromarray(np.uint8(frame)).convert("RGB")
        inputs = blip_processor(images=img, return_tensors="pt")
        # Pure inference: skip autograd bookkeeping to save memory/time.
        with torch.no_grad():
            out = blip_model.generate(**inputs)
        cap = blip_processor.decode(out[0], skip_special_tokens=True).strip().capitalize()
        descricao = (
            f"Cena {idx}: {cap}. "
            "Registrada ao entardecer com luz dourada suave, profundidade de campo rasa desfocando o fundo. "
            "O ambiente apresenta uma paisagem urbana vibrante com vegetação e construções ao longe, "
            "enquanto o sujeito exibe expressão serena e postura confiante. "
            "Atmosfera de leve nostalgia e contemplação."
        )
        prompt = (
            f"A cinematic, warm golden-hour shot of {cap}. "
            "Captured with a 35mm lens at f/1.8 for shallow depth of field, "
            "soft backlighting, and gentle handheld motion. "
            "Background features a vibrant urban setting with trees and distant buildings, "
            "evoking a sense of nostalgia and calm. "
            "--ar 16:9 --v 6 --style photorealistic --quality 2"
        )
        negative = (
            "--no (CGI artifacts, plastic textures, overexposed skies, cartoonish colors, static poses, low detail)"
        )
        return descricao, prompt, negative

    def extract_keyframes(self, threshold=30.0):
        """Scan the video and record frames that differ strongly from the last.

        A frame is a keyframe when the mean absolute grayscale difference from
        the previous frame exceeds ``threshold`` (the first frame always is).
        Each keyframe is saved as a JPEG and captioned; results go to
        ``self.scenes``.

        Args:
            threshold: mean absolute pixel difference (0-255 scale) that
                triggers a new scene.

        Returns:
            bool: always True (kept for interface compatibility).
        """
        output_dir = f"keyframes_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        os.makedirs(output_dir, exist_ok=True)
        cap = cv2.VideoCapture(self.video_path)
        prev = None
        count = 0
        self.scenes = []
        try:
            success, frame = cap.read()
            while success:
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                # First frame has nothing to compare against -> force keyframe.
                diff = np.inf if prev is None else np.mean(cv2.absdiff(gray, prev))
                if prev is None or diff > threshold:
                    timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
                    # Convert BGR->RGB once and reuse for both saving and BLIP
                    # (the original converted the same frame twice).
                    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    img_path = os.path.join(output_dir, f"keyframe_{count+1:02d}.jpg")
                    Image.fromarray(rgb).save(img_path)
                    desc, prmpt, neg = self.describe_image_and_generate_prompt(rgb, count+1)
                    self.scenes.append({
                        'scene_number': count+1,
                        'time': timestamp,
                        'image_path': img_path,
                        'descricao_detalhada': desc,
                        'prompt_ia': prmpt,
                        'negative_prompt': neg
                    })
                    count += 1
                prev = gray
                success, frame = cap.read()
        finally:
            # Guarantee the capture handle is released even if captioning raises.
            cap.release()
        return True

    def save_results(self, output_file=None):
        """Dump video metadata and scene analyses to a UTF-8 JSON file.

        Args:
            output_file: destination path; defaults to a timestamped name.

        Returns:
            str: the path the results were written to.
        """
        if not output_file:
            output_file = f"video_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        results = {
            'video_info': self.video_info,
            'analysis_type': 'keyframe_extraction',
            'scenes': self.scenes,
            'generated_at': datetime.now().isoformat()
        }
        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps the Portuguese accents readable.
            json.dump(results, f, indent=2, ensure_ascii=False)
        return output_file

    def cleanup(self):
        """Release the moviepy clip; safe to call repeatedly."""
        if self.clip:
            self.clip.close()
            # Drop the reference so a second cleanup() is a no-op.
            self.clip = None