| import torch |
| import clip |
| import cv2 |
| from PIL import Image |
| from concurrent.futures import ThreadPoolExecutor |
|
|
# All inference in this module is pinned to the CPU.
device = "cpu"

# clip.load() hands back the network together with the image transform that
# matches it; eval() switches the network to inference behaviour.
model, preprocess = clip.load("ViT-B/16", device=device)
model.eval()

# Lower-case filename suffixes used by process_inputs() to route each path to
# the image or the video pipeline.
SUPPORTED_IMAGE_EXTS = (".jpg", ".jpeg", ".png")
SUPPORTED_VIDEO_EXTS = (".mp4", ".avi", ".mov")
|
|
def preprocess_image(path):
    """Load one image file and run it through the CLIP preprocessing transform.

    Args:
        path: Location of the image file, handed to ``PIL.Image.open``.

    Returns:
        Whatever the module-level ``preprocess`` transform returns for the
        RGB version of the image (a tensor that can be stacked into a batch).
    """
    # Open inside a context manager so the file handle is closed as soon as
    # the pixels have been read, instead of whenever the object is collected.
    with Image.open(path) as img:
        # convert() returns a new, fully loaded image, so it stays usable
        # after the source file has been closed.
        rgb = img.convert("RGB")
    return preprocess(rgb)
|
|
def load_images_parallel(image_paths, max_workers=4):
    """Preprocess several image files on a thread pool and batch the results.

    Args:
        image_paths: Iterable of image file paths.
        max_workers: Size of the thread pool.

    Returns:
        A single tensor made by stacking the per-image tensors in the same
        order as ``image_paths``.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # map() yields results in input order regardless of which worker
        # finishes first.
        tensors = [*pool.map(preprocess_image, image_paths)]
    return torch.stack(tensors)
|
|
def images_to_embeddings_cpu(image_paths, batch_size=32):
    """Embed image files with the CLIP image encoder, batch by batch.

    Args:
        image_paths: Sequence of image file paths (must support ``len()`` and
            slicing).
        batch_size: Number of images pushed through the model per forward
            pass. Must be at least 1.

    Returns:
        A NumPy array with one L2-normalised embedding per input image, in
        input order. An empty ``image_paths`` yields an array with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if len(image_paths) == 0:
        # torch.cat() refuses an empty list, so answer the empty case directly.
        # NOTE(review): relies on the CLIP vision tower exposing `output_dim`.
        return torch.empty((0, model.visual.output_dim)).numpy()

    outputs = []
    for start in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[start:start + batch_size]
        images = load_images_parallel(batch_paths)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            emb = model.encode_image(images)
            # Scale every row to unit length.
            emb = emb / emb.norm(dim=-1, keepdim=True)
        outputs.append(emb)

    return torch.cat(outputs).numpy()
|
|
def extract_frames(video_path, sample_rate=1):
    """Sample frames from a video at a fixed spacing.

    Args:
        video_path: Path of the video, handed to ``cv2.VideoCapture``.
        sample_rate: Spacing between kept frames, expressed in seconds of
            video: it is multiplied by the reported FPS to get a frame
            interval. Despite the name, a larger value keeps fewer frames.

    Returns:
        A list of RGB ``PIL.Image`` objects in playback order. The list is
        empty when the capture cannot be opened or yields no frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # max() clamps the interval to one frame, so a reported FPS of 0 (or
        # a very small sample_rate) results in every frame being kept.
        interval = int(max(1, fps * sample_rate))

        count = 0
        # grab() only advances to the next frame; retrieve() is called just
        # for the frames that are kept, so skipped frames are never converted.
        while cap.isOpened() and cap.grab():
            if count % interval == 0:
                ok, frame = cap.retrieve()
                if not ok:
                    break
                # OpenCV delivers BGR; PIL and CLIP preprocessing expect RGB.
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(rgb))
            count += 1
    finally:
        # Release the capture even if decoding or conversion raised.
        cap.release()
    return frames
|
|
|
|
def video_to_embedding_cpu(video_path, batch_size=32):
    """Summarise a video as the mean CLIP embedding of its sampled frames.

    Args:
        video_path: Path of the video file.
        batch_size: Maximum number of frames encoded per forward pass. Keeps
            peak memory bounded for long videos. Must be at least 1.

    Returns:
        A 1-D NumPy array holding the mean of the L2-normalised frame
        embeddings, or ``None`` when no frame could be extracted. The mean
        itself is not re-normalised.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    frames = extract_frames(video_path)
    if not frames:
        return None

    chunks = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for start in range(0, len(frames), batch_size):
            batch = torch.stack(
                [preprocess(f) for f in frames[start:start + batch_size]]
            )
            emb = model.encode_image(batch)
            # Scale every frame embedding to unit length before averaging.
            chunks.append(emb / emb.norm(dim=-1, keepdim=True))

    # Average over all frames at once so every frame carries equal weight,
    # whatever the size of the last chunk.
    return torch.cat(chunks).mean(dim=0).numpy()
|
|
|
|
def process_videos_parallel(video_files, max_workers=2):
    """Embed several videos concurrently on a thread pool.

    Args:
        video_files: Iterable of video file paths.
        max_workers: Size of the thread pool.

    Returns:
        A list with one ``video_to_embedding_cpu`` result per input path, in
        input order. Entries are ``None`` for videos that produced no frames.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        embeddings = [*pool.map(video_to_embedding_cpu, video_files)]
    return embeddings
|
|
|
|
def process_inputs(files):
    """Route paths to the image or video pipeline by file extension.

    Args:
        files: Iterable of path strings. The extension match is
            case-insensitive; paths with any other extension are ignored.

    Returns:
        A dict that contains ``"images"`` (the image embedding array) only
        when at least one image path was given, and ``"videos"`` (a list of
        per-video embeddings) only when at least one video path was given.
    """
    image_files, video_files = [], []
    for path in files:
        lowered = path.lower()
        if lowered.endswith(SUPPORTED_IMAGE_EXTS):
            image_files.append(path)
            continue
        if lowered.endswith(SUPPORTED_VIDEO_EXTS):
            video_files.append(path)

    results = {}
    if image_files:
        results["images"] = images_to_embeddings_cpu(image_files)
    if video_files:
        results["videos"] = process_videos_parallel(video_files)
    return results
|
|
|
|
|
|