"""CPU-side CLIP embedding helpers for image and video files."""

from concurrent.futures import ThreadPoolExecutor

import clip
import cv2
import torch
from PIL import Image

device = "cpu"
model, preprocess = clip.load("ViT-B/16", device=device)
model.eval()

SUPPORTED_IMAGE_EXTS = (".jpg", ".jpeg", ".png")
SUPPORTED_VIDEO_EXTS = (".mp4", ".avi", ".mov")


def preprocess_image(path):
    """Open the image at *path*, convert it to RGB and apply the CLIP preprocess.

    The file is opened in a ``with`` block so its handle is closed as soon
    as the pixel data has been converted, instead of lingering until
    garbage collection.
    """
    with Image.open(path) as img:
        return preprocess(img.convert("RGB"))


def load_images_parallel(image_paths, max_workers=4):
    """Preprocess *image_paths* on a thread pool and stack them into one tensor.

    ``executor.map`` yields results in input order, so row ``i`` of the
    returned tensor corresponds to ``image_paths[i]``.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        images = list(executor.map(preprocess_image, image_paths))
    return torch.stack(images)


def images_to_embeddings_cpu(image_paths, batch_size=32):
    """Encode images with the module-level CLIP model, *batch_size* at a time.

    Each embedding is divided by its norm along the last dimension before
    the batches are concatenated.

    Args:
        image_paths: Sequence of image file paths. Must be non-empty:
            with no batches there is nothing for ``torch.cat`` to join.
        batch_size: Number of images encoded per forward pass.

    Returns:
        A NumPy array with one normalized embedding row per input path,
        in input order.
    """
    outputs = []
    for i in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[i:i + batch_size]
        images = load_images_parallel(batch_paths)
        with torch.no_grad():
            emb = model.encode_image(images)
            emb = emb / emb.norm(dim=-1, keepdim=True)
        outputs.append(emb)
    return torch.cat(outputs).numpy()


def extract_frames(video_path, sample_rate=1):
    """Return sampled frames of *video_path* as RGB PIL images.

    One frame is kept every ``int(max(1, fps * sample_rate))`` decoded
    frames, where ``fps`` is the value reported by the capture; the
    ``max(1, ...)`` keeps the interval valid when that product is below 1.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        sample_rate: Multiplier applied to the reported FPS to obtain the
            sampling interval in frames.

    Returns:
        A list of PIL images; empty when the capture cannot be opened or
        yields no frames.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        interval = int(max(1, fps * sample_rate))

        frames = []
        count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if count % interval == 0:
                # OpenCV decodes to BGR; convert before building the PIL image.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame))
            count += 1
        return frames
    finally:
        # Release the capture even if decoding or conversion raised.
        cap.release()


def video_to_embedding_cpu(video_path, batch_size=32):
    """Return the mean of the normalized CLIP embeddings of a video's frames.

    Frames are encoded *batch_size* at a time, mirroring
    ``images_to_embeddings_cpu``, so a single forward pass never has to
    hold every sampled frame. Per-batch sums of the normalized embeddings
    are accumulated and divided by the total frame count, which is the
    mean over all frames.

    Args:
        video_path: Path of the video to embed.
        batch_size: Number of frames encoded per forward pass.

    Returns:
        A 1-D NumPy array, or ``None`` when no frames could be extracted.
    """
    frames = extract_frames(video_path)
    if not frames:
        return None

    total = None
    with torch.no_grad():
        for start in range(0, len(frames), batch_size):
            chunk = frames[start:start + batch_size]
            images = torch.stack([preprocess(f) for f in chunk])
            emb = model.encode_image(images)
            emb = emb / emb.norm(dim=-1, keepdim=True)
            batch_sum = emb.sum(dim=0)
            total = batch_sum if total is None else total + batch_sum
    return (total / len(frames)).numpy()


def process_videos_parallel(video_files, max_workers=2):
    """Embed each video on a thread pool, returning results in input order.

    Entries are ``None`` for videos from which no frames were extracted.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(video_to_embedding_cpu, video_files))


def process_inputs(files):
    """Split *files* by extension and embed the images and videos found.

    Extension matching is case-insensitive. Files whose extension is in
    neither ``SUPPORTED_IMAGE_EXTS`` nor ``SUPPORTED_VIDEO_EXTS`` are
    ignored.

    Returns:
        A dict that contains ``"images"`` (array of image embeddings) only
        when at least one image was given, and ``"videos"`` (list of
        per-video embeddings or ``None``) only when at least one video
        was given.
    """
    image_files = []
    video_files = []
    for f in files:
        f_lower = f.lower()
        if f_lower.endswith(SUPPORTED_IMAGE_EXTS):
            image_files.append(f)
        elif f_lower.endswith(SUPPORTED_VIDEO_EXTS):
            video_files.append(f)

    results = {}
    if image_files:
        results["images"] = images_to_embeddings_cpu(image_files)
    if video_files:
        results["videos"] = process_videos_parallel(video_files)
    return results