"""
Extract keyframes from videos and upload to S3-compatible storage (Backblaze B2).
For each video that has captions but no keyframes in storage:
1. Get the frame_ids from video_captions
2. Get the video source (local file or S3)
3. Extract those exact frames using OpenCV
4. Upload to S3 at {video_id}/frame_XXXXXX.jpg
"""
import os
import sys
import io
import tempfile
import cv2
from pymongo import MongoClient
from minio import Minio
from dotenv import load_dotenv
# Module-level configuration: environment loading, MongoDB and S3-compatible
# (MinIO SDK) clients, bucket names, and the local project root.
# NOTE: everything below runs at import time.
load_dotenv()
# NOTE(review): SECURITY - the fallback URI embeds a database username and
# password in source control. Rotate this credential and require MONGO_URI to
# be supplied by the environment instead of shipping a default.
MONGO_URI = os.getenv("MONGO_URI", "mongodb+srv://detectifai_user:DetectifAI123@cluster0.6f9uj.mongodb.net/detectifai?retryWrites=true&w=majority&appName=Cluster0")
client = MongoClient(MONGO_URI)
# Database handle used by every function in this file (collections read here:
# video_file and video_captions).
db = client.detectifai
# NOTE(review): SECURITY - the access/secret key fallbacks below are literal
# storage credentials committed to source. Rotate them and drop the defaults.
minio_client = Minio(
os.getenv('MINIO_ENDPOINT', 's3.eu-central-003.backblazeb2.com'),
access_key=os.getenv('MINIO_ACCESS_KEY', '00367479ffb7e4e0000000001'),
secret_key=os.getenv('MINIO_SECRET_KEY', 'K003opTvf92ijRj5dM7H1dgrlwcGTdA'),
# Any value other than "true" (case-insensitive) disables TLS.
secure=os.getenv('MINIO_SECURE', 'true').lower() == 'true',
# An empty MINIO_REGION is passed to the client as None.
region=os.getenv('MINIO_REGION', 'eu-central-003') or None
)
# Bucket that receives extracted/placeholder keyframes.
KEYFRAME_BUCKET = os.getenv('MINIO_KEYFRAME_BUCKET', 'detectifai-keyframes')
# Default bucket for source videos when a video_file record names none.
VIDEO_BUCKET = os.getenv('MINIO_VIDEO_BUCKET', 'detectifai-videos')
# Project root containing the "uploads" directory.
# NOTE(review): the fallback is a developer-specific Windows path; set
# BASE_DIR explicitly on any other machine.
BASE_DIR = os.getenv('BASE_DIR', r"d:\FAST\Final Year Project\sem1_finalized_malaika\sem1")
def get_video_source(video_id):
    """Return a filesystem path to the video, or None when no source exists.

    Lookup order:
      1. A non-empty local upload at ``BASE_DIR/uploads/<video_id>/video.mp4``.
      2. The object recorded for the video in the ``video_file`` collection,
         downloaded to ``<system temp dir>/<video_id>.mp4``.

    Args:
        video_id: Identifier used as the upload directory name, as the
            ``video_id`` key in ``video_file`` and as the temp file stem.

    Returns:
        Path to the video file, or ``None`` if there is no local file and no
        object that can be located in storage.

    Raises:
        Whatever ``fget_object`` raises if the download itself fails; only
        the existence probe is treated as best-effort.
    """
    # Check local uploads first.
    local_path = os.path.join(BASE_DIR, "uploads", video_id, "video.mp4")
    if os.path.isfile(local_path) and os.path.getsize(local_path) > 0:
        print(f" Using local file: {local_path}")
        return local_path

    # Fall back to object storage.
    rec = db.video_file.find_one(
        {"video_id": video_id},
        {"minio_object_key": 1, "minio_bucket": 1},
    )
    obj_key = rec.get("minio_object_key") if rec else None
    if not obj_key:
        return None

    # dict.get's default only applies when the key is absent. A record that
    # stores minio_bucket as null or "" must also use the default bucket,
    # otherwise the probe below fails and the video is reported missing.
    bucket = rec.get("minio_bucket") or VIDEO_BUCKET

    # Verify the object actually exists before downloading. Any failure of
    # the probe is deliberately treated as "no source available".
    try:
        minio_client.stat_object(bucket, obj_key)
    except Exception:
        print(f" MinIO object not found: {bucket}/{obj_key}")
        return None

    print(f" Downloading from MinIO: {bucket}/{obj_key}")
    # Callers in this file recompute this exact path to clean up after
    # themselves, so the naming scheme must stay <tempdir>/<video_id>.mp4.
    tmp_path = os.path.join(tempfile.gettempdir(), f"{video_id}.mp4")
    minio_client.fget_object(bucket, obj_key, tmp_path)
    print(f" Downloaded to: {tmp_path}")
    return tmp_path
import numpy as np
def _placeholder_background(height=360, width=640):
    """Build the dark vertical-gradient canvas shared by all placeholders."""
    canvas = np.zeros((height, width, 3), dtype=np.uint8)
    for y in range(height):
        val = int(30 + (y / height) * 40)
        canvas[y, :] = [val, int(val * 0.8), int(val * 0.5)]
    return canvas


def _wrap_caption(text, max_chars=80, line_width=50):
    """Greedily wrap the first ``max_chars`` characters of ``text`` into lines."""
    lines = []
    line = ""
    for word in text[:max_chars].split():
        candidate = f"{line} {word}" if line else word
        # Only break when there is something to flush; a single over-long
        # word on an empty line is kept as its own line instead of emitting
        # a blank line before it.
        if line and len(candidate) > line_width:
            lines.append(line)
            line = word
        else:
            line = candidate
    if line:
        lines.append(line)
    return lines


def upload_placeholder_keyframes(video_id, frame_ids):
    """Generate and upload placeholder keyframes for a video whose source is gone.

    For each frame id a 640x360 JPEG is rendered showing the video id, the
    frame id, a "VIDEO" box and the frame's caption (first 80 characters,
    wrapped at 50), then uploaded to ``KEYFRAME_BUCKET`` under
    ``<video_id>/<frame_id>.jpg``.

    Args:
        video_id: Video identifier; used as the object key prefix.
        frame_ids: Iterable of frame identifiers to create placeholders for.

    Returns:
        Number of placeholder images uploaded. Frames whose JPEG encoding
        fails are skipped and not counted.
    """
    # The gradient is identical for every frame: build it once, copy per frame.
    background = _placeholder_background()
    font = cv2.FONT_HERSHEY_SIMPLEX
    uploaded = 0

    for frame_id in frame_ids:
        # Caption text for this frame, shown on the placeholder.
        caption_doc = db.video_captions.find_one(
            {"video_id": video_id, "frame_id": frame_id},
            {"caption": 1, "_id": 0}
        )
        caption_text = caption_doc.get("caption") if caption_doc else None
        if caption_text is None:
            # Covers a missing document, a missing field and a stored null.
            caption_text = "No caption"

        img = background.copy()

        # Identifiers; str() guards against non-string ids, which
        # cv2.putText would reject.
        cv2.putText(img, str(video_id), (20, 40), font, 0.5, (150, 150, 150), 1)
        cv2.putText(img, str(frame_id), (20, 70), font, 0.5, (150, 150, 150), 1)

        # Camera icon placeholder.
        cv2.rectangle(img, (270, 130), (370, 210), (80, 80, 80), 2)
        cv2.putText(img, "VIDEO", (284, 178), font, 0.6, (120, 120, 120), 1)

        # Caption, wrapped, 22 px per line starting at y=250.
        y_pos = 250
        for text_line in _wrap_caption(str(caption_text)):
            cv2.putText(img, text_line, (20, y_pos), font, 0.4, (200, 200, 200), 1)
            y_pos += 22

        success, buffer = cv2.imencode('.jpg', img, [cv2.IMWRITE_JPEG_QUALITY, 85])
        if not success:
            continue

        # Serialize once; the same bytes provide both the stream and its length.
        payload = buffer.tobytes()
        minio_client.put_object(
            KEYFRAME_BUCKET,
            f"{video_id}/{frame_id}.jpg",
            io.BytesIO(payload),
            length=len(payload),
            content_type='image/jpeg'
        )
        uploaded += 1

    return uploaded
def extract_and_upload_keyframes(video_id, frame_ids):
    """Extract specific frames from a video and upload them as JPEG keyframes.

    The video is located via ``get_video_source``. When no source exists,
    placeholder images are uploaded instead. Otherwise each frame id of the
    form ``frame_<number>`` is seeked to, encoded as JPEG (quality 85) and
    uploaded to ``KEYFRAME_BUCKET`` as ``<video_id>/<frame_id>.jpg``.

    Args:
        video_id: Video identifier; used as the object key prefix.
        frame_ids: Iterable of frame identifiers such as ``"frame_000060"``.

    Returns:
        Number of keyframes (or placeholders) uploaded. Returns 0 when no
        frame id can be parsed or the video cannot be opened.
    """
    video_path = get_video_source(video_id)
    if not video_path:
        print(" No video source found — generating placeholder keyframes")
        return upload_placeholder_keyframes(video_id, frame_ids)

    # Remote videos are downloaded to exactly this path; a match means the
    # file is a temporary copy that this function is responsible for deleting.
    tmp_path = os.path.join(tempfile.gettempdir(), f"{video_id}.mp4")
    is_temp_copy = video_path == tmp_path

    cap = None
    try:
        # Map numeric frame index -> original frame id ("frame_000060" -> 60).
        frame_numbers = {}
        for fid in frame_ids:
            try:
                frame_numbers[int(str(fid).replace("frame_", ""))] = fid
            except ValueError:
                print(f" WARNING: Could not parse frame_id: {fid}")

        if not frame_numbers:
            print(" No valid frame numbers to extract")
            return 0

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f" ERROR: Could not open video: {video_path}")
            return 0

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        print(f" Video: {total_frames} frames, {fps:.1f} fps")

        uploaded = 0
        for frame_num in sorted(frame_numbers):
            frame_num_actual = frame_num
            # Clamp to the last frame only when the container reports a
            # usable frame count; a count of 0 would otherwise turn every
            # seek into frame -1.
            if 0 < total_frames <= frame_num:
                frame_num_actual = total_frames - 1
                print(f" Frame {frame_num} beyond total ({total_frames}), using frame {frame_num_actual}")

            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num_actual)
            ret, frame = cap.read()
            if not ret:
                print(f" ERROR: Could not read frame {frame_num_actual}")
                continue

            success, buffer = cv2.imencode('.jpg', frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
            if not success:
                print(f" ERROR: Could not encode frame {frame_num}")
                continue

            frame_id = frame_numbers[frame_num]
            # Serialize once; reuse the bytes for both stream and length.
            payload = buffer.tobytes()
            minio_client.put_object(
                KEYFRAME_BUCKET,
                f"{video_id}/{frame_id}.jpg",
                io.BytesIO(payload),
                length=len(payload),
                content_type='image/jpeg'
            )
            uploaded += 1

        return uploaded
    finally:
        # Runs on every exit path (early returns and upload/read errors), so
        # neither the capture handle nor the downloaded temp video leaks.
        # Release first: an open handle can block deletion on Windows.
        if cap is not None:
            cap.release()
        if is_temp_copy and os.path.exists(tmp_path):
            os.remove(tmp_path)
def main():
    """Backfill keyframes for every captioned, non-test video, then report.

    Videos whose id starts with ``test_`` are ignored. A video is skipped
    when the keyframe bucket already holds objects under its prefix or when
    its captions carry no frame ids. After processing, the number of stored
    keyframes per video is printed.
    """
    def is_real_video(vid):
        # Ids prefixed with "test_" are excluded from both passes.
        return not vid.startswith("test_")

    def stored_keyframes(vid):
        # All objects currently stored under "<vid>/" in the keyframe bucket.
        return list(
            minio_client.list_objects(
                KEYFRAME_BUCKET, prefix=f"{vid}/", recursive=True
            )
        )

    # Every video_id that has at least one caption document.
    caption_vids = db.video_captions.distinct("video_id")

    for video_id in filter(is_real_video, caption_vids):
        existing = stored_keyframes(video_id)
        if existing:
            print(f"SKIP {video_id}: already has {len(existing)} keyframes in MinIO")
            continue

        frame_ids = db.video_captions.distinct("frame_id", {"video_id": video_id})
        if not frame_ids:
            print(f"SKIP {video_id}: no frame_ids in captions")
            continue

        print(f"\nPROCESSING {video_id}: {len(frame_ids)} frames to extract")
        uploaded = extract_and_upload_keyframes(video_id, frame_ids)
        print(f" Uploaded {uploaded}/{len(frame_ids)} keyframes to MinIO")

    print("\n=== DONE ===")

    # Final check: report what is in storage for each processed video id.
    for video_id in filter(is_real_video, caption_vids):
        objs = stored_keyframes(video_id)
        print(f" {video_id}: {len(objs)} keyframes in MinIO")


if __name__ == "__main__":
    main()
|