""" Extract keyframes from videos and upload to S3-compatible storage (Backblaze B2). For each video that has captions but no keyframes in storage: 1. Get the frame_ids from video_captions 2. Get the video source (local file or S3) 3. Extract those exact frames using OpenCV 4. Upload to S3 at {video_id}/frame_XXXXXX.jpg """ import os import sys import io import tempfile import cv2 from pymongo import MongoClient from minio import Minio from dotenv import load_dotenv load_dotenv() MONGO_URI = os.getenv("MONGO_URI", "mongodb+srv://detectifai_user:DetectifAI123@cluster0.6f9uj.mongodb.net/detectifai?retryWrites=true&w=majority&appName=Cluster0") client = MongoClient(MONGO_URI) db = client.detectifai minio_client = Minio( os.getenv('MINIO_ENDPOINT', 's3.eu-central-003.backblazeb2.com'), access_key=os.getenv('MINIO_ACCESS_KEY', '00367479ffb7e4e0000000001'), secret_key=os.getenv('MINIO_SECRET_KEY', 'K003opTvf92ijRj5dM7H1dgrlwcGTdA'), secure=os.getenv('MINIO_SECURE', 'true').lower() == 'true', region=os.getenv('MINIO_REGION', 'eu-central-003') or None ) KEYFRAME_BUCKET = os.getenv('MINIO_KEYFRAME_BUCKET', 'detectifai-keyframes') VIDEO_BUCKET = os.getenv('MINIO_VIDEO_BUCKET', 'detectifai-videos') BASE_DIR = os.getenv('BASE_DIR', r"d:\FAST\Final Year Project\sem1_finalized_malaika\sem1") def get_video_source(video_id): """Return path to video file. Download from MinIO if not local.""" # Check local uploads first local_path = os.path.join(BASE_DIR, "uploads", video_id, "video.mp4") if os.path.isfile(local_path) and os.path.getsize(local_path) > 0: print(f" Using local file: {local_path}") return local_path # Check MinIO rec = db.video_file.find_one({"video_id": video_id}, {"minio_object_key": 1, "minio_bucket": 1}) if rec and rec.get("minio_object_key"): bucket = rec.get("minio_bucket", VIDEO_BUCKET) obj_key = rec["minio_object_key"] # Verify the object actually exists before downloading try: minio_client.stat_object(bucket, obj_key) except Exception: print(f" MinIO object not found: {bucket}/{obj_key}") return None print(f" Downloading from MinIO: {bucket}/{obj_key}") tmp_path = os.path.join(tempfile.gettempdir(), f"{video_id}.mp4") minio_client.fget_object(bucket, obj_key, tmp_path) print(f" Downloaded to: {tmp_path}") return tmp_path return None import numpy as np def upload_placeholder_keyframes(video_id, frame_ids): """Generate and upload placeholder keyframe images for videos whose source is gone.""" uploaded = 0 for frame_id in frame_ids: # Get the caption text for this frame to display on placeholder caption_doc = db.video_captions.find_one( {"video_id": video_id, "frame_id": frame_id}, {"caption": 1, "_id": 0} ) caption_text = caption_doc.get("caption", "No caption") if caption_doc else "No caption" # Create a 640x360 dark gradient placeholder image img = np.zeros((360, 640, 3), dtype=np.uint8) # Dark blue gradient for y in range(360): val = int(30 + (y / 360) * 40) img[y, :] = [val, int(val * 0.8), int(val * 0.5)] # Add text font = cv2.FONT_HERSHEY_SIMPLEX # Video ID cv2.putText(img, video_id, (20, 40), font, 0.5, (150, 150, 150), 1) # Frame ID cv2.putText(img, frame_id, (20, 70), font, 0.5, (150, 150, 150), 1) # Camera icon placeholder cv2.rectangle(img, (270, 130), (370, 210), (80, 80, 80), 2) cv2.putText(img, "VIDEO", (284, 178), font, 0.6, (120, 120, 120), 1) # Caption (wrap if long) words = caption_text[:80].split() line = "" y_pos = 250 for w in words: test = line + " " + w if line else w if len(test) > 50: cv2.putText(img, line, (20, y_pos), font, 0.4, (200, 200, 200), 1) y_pos += 22 line = w else: line = test if line: cv2.putText(img, line, (20, y_pos), font, 0.4, (200, 200, 200), 1) # Encode as JPEG success, buffer = cv2.imencode('.jpg', img, [cv2.IMWRITE_JPEG_QUALITY, 85]) if not success: continue minio_path = f"{video_id}/{frame_id}.jpg" data = io.BytesIO(buffer.tobytes()) minio_client.put_object( KEYFRAME_BUCKET, minio_path, data, length=len(buffer.tobytes()), content_type='image/jpeg' ) uploaded += 1 return uploaded def extract_and_upload_keyframes(video_id, frame_ids): """Extract specific frames from video and upload to MinIO.""" video_path = get_video_source(video_id) if not video_path: print(f" No video source found — generating placeholder keyframes") return upload_placeholder_keyframes(video_id, frame_ids) # Parse frame numbers from frame_ids like "frame_000060" frame_numbers = {} for fid in frame_ids: try: num = int(fid.replace("frame_", "")) frame_numbers[num] = fid except ValueError: print(f" WARNING: Could not parse frame_id: {fid}") if not frame_numbers: print(f" No valid frame numbers to extract") return 0 cap = cv2.VideoCapture(video_path) if not cap.isOpened(): print(f" ERROR: Could not open video: {video_path}") return 0 total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) fps = cap.get(cv2.CAP_PROP_FPS) print(f" Video: {total_frames} frames, {fps:.1f} fps") uploaded = 0 max_frame = max(frame_numbers.keys()) for frame_num in sorted(frame_numbers.keys()): if frame_num >= total_frames: # Use last available frame frame_num_actual = total_frames - 1 print(f" Frame {frame_num} beyond total ({total_frames}), using frame {frame_num_actual}") else: frame_num_actual = frame_num cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num_actual) ret, frame = cap.read() if not ret: print(f" ERROR: Could not read frame {frame_num_actual}") continue # Encode as JPEG success, buffer = cv2.imencode('.jpg', frame, [cv2.IMWRITE_JPEG_QUALITY, 85]) if not success: print(f" ERROR: Could not encode frame {frame_num}") continue frame_id = frame_numbers[frame_num] minio_path = f"{video_id}/{frame_id}.jpg" # Upload to MinIO data = io.BytesIO(buffer.tobytes()) minio_client.put_object( KEYFRAME_BUCKET, minio_path, data, length=len(buffer.tobytes()), content_type='image/jpeg' ) uploaded += 1 cap.release() # Clean up temp file if downloaded from MinIO tmp_path = os.path.join(tempfile.gettempdir(), f"{video_id}.mp4") if os.path.exists(tmp_path) and video_path == tmp_path: os.remove(tmp_path) return uploaded def main(): # Get all video_ids with captions caption_vids = db.video_captions.distinct("video_id") for video_id in caption_vids: if video_id.startswith("test_"): continue # Check if keyframes already exist in MinIO existing = list(minio_client.list_objects(KEYFRAME_BUCKET, prefix=f"{video_id}/", recursive=True)) if len(existing) > 0: print(f"SKIP {video_id}: already has {len(existing)} keyframes in MinIO") continue # Get frame_ids from captions frame_ids = db.video_captions.distinct("frame_id", {"video_id": video_id}) if not frame_ids: print(f"SKIP {video_id}: no frame_ids in captions") continue print(f"\nPROCESSING {video_id}: {len(frame_ids)} frames to extract") uploaded = extract_and_upload_keyframes(video_id, frame_ids) print(f" Uploaded {uploaded}/{len(frame_ids)} keyframes to MinIO") print("\n=== DONE ===") # Final check for video_id in caption_vids: if video_id.startswith("test_"): continue objs = list(minio_client.list_objects(KEYFRAME_BUCKET, prefix=f"{video_id}/", recursive=True)) print(f" {video_id}: {len(objs)} keyframes in MinIO") if __name__ == "__main__": main()