import torch
import clip
import cv2
from PIL import Image
from concurrent.futures import ThreadPoolExecutor

# All inference in this module runs on the CPU.
device = "cpu"

# Load the CLIP ViT-B/16 checkpoint once, at import time. `clip.load` returns
# the model together with the image transform that the functions below apply
# to PIL images before encoding them.
model, preprocess = clip.load("ViT-B/16", device=device)
# The model is only used for inference here, so switch it to evaluation mode.
model.eval()

# Lower-case file-name suffixes that process_inputs routes to the image and
# video pipelines respectively; each tuple is passed directly to str.endswith.
SUPPORTED_IMAGE_EXTS = (".jpg", ".jpeg", ".png")
SUPPORTED_VIDEO_EXTS = (".mp4", ".avi", ".mov")

def preprocess_image(path):
    """Load one image file and turn it into a CLIP model input.

    Args:
        path: Location of the image file, passed to ``Image.open``.

    Returns:
        The result of applying the module-level ``preprocess`` transform to
        the RGB version of the image.

    Raises:
        Whatever ``Image.open`` / ``Image.convert`` raise for a missing or
        undecodable file.
    """
    # Image.open is lazy and keeps the file handle open. The context manager
    # closes it deterministically, including when decoding fails, which
    # matters because this function runs concurrently over many files.
    with Image.open(path) as img:
        # convert() forces the pixel data to be read and returns an image
        # that no longer depends on the open file.
        rgb = img.convert("RGB")
    return preprocess(rgb)

def load_images_parallel(image_paths, max_workers=4):
    """Preprocess several image files concurrently and stack the results.

    Args:
        image_paths: Iterable of image file paths.
        max_workers: Size of the thread pool used for loading.

    Returns:
        A single tensor built with ``torch.stack`` from the per-image
        results, in the same order as ``image_paths``.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # map() yields results in input order regardless of completion order.
        tensor_iter = pool.map(preprocess_image, image_paths)
        per_image = list(tensor_iter)
    return torch.stack(per_image)

def images_to_embeddings_cpu(image_paths, batch_size=32):
    """Encode image files with the module-level CLIP model, batch by batch.

    Args:
        image_paths: Sequence of image file paths (must support ``len`` and
            slicing).
        batch_size: Number of images encoded per forward pass.

    Returns:
        NumPy array of the L2-normalised image embeddings, one row per input
        path, in input order.
    """

    def _encode(chunk):
        # Load and preprocess this slice of paths, then embed it without
        # tracking gradients.
        pixel_batch = load_images_parallel(chunk)
        with torch.no_grad():
            features = model.encode_image(pixel_batch)
            # Scale each embedding to unit length along the feature axis.
            return features / features.norm(dim=-1, keepdim=True)

    starts = range(0, len(image_paths), batch_size)
    embedded = [_encode(image_paths[s:s + batch_size]) for s in starts]
    return torch.cat(embedded).numpy()

def extract_frames(video_path, sample_rate=1):
    """Sample frames from a video file as RGB PIL images.

    Args:
        video_path: Path of the video, handed to ``cv2.VideoCapture``.
        sample_rate: Spacing between kept frames, in seconds of video.
            Despite the name this is a period, not a frequency: one frame is
            kept out of every ``int(fps * sample_rate)`` decoded frames
            (every frame if that value is below 1).

    Returns:
        List of ``PIL.Image`` objects in RGB order. The list is empty when
        the capture could not be opened or no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # max(1, ...) keeps the modulus valid when the container reports an
        # fps of 0 or the product is below one frame.
        interval = int(max(1, fps * sample_rate))

        count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if count % interval == 0:
                # OpenCV decodes to BGR; PIL (and the CLIP transform) expect RGB.
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(rgb))

            count += 1
    finally:
        # Always free the capture handle, even if decoding or colour
        # conversion raised part-way through the video.
        cap.release()
    return frames


def video_to_embedding_cpu(video_path, batch_size=32):
    """Embed a video as the mean of its sampled frames' CLIP embeddings.

    Args:
        video_path: Path of the video file, forwarded to ``extract_frames``.
        batch_size: Maximum number of frames sent through the model in one
            forward pass. Must be at least 1.

    Returns:
        NumPy array holding the mean of the L2-normalised per-frame
        embeddings, or ``None`` when no frame could be extracted.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    frames = extract_frames(video_path)

    if not frames:
        return None

    # Encode in bounded batches: pushing every sampled frame through the
    # model at once makes peak memory grow with the length of the video.
    per_batch = []
    with torch.no_grad():
        for start in range(0, len(frames), batch_size):
            chunk = frames[start:start + batch_size]
            images = torch.stack([preprocess(f) for f in chunk])
            emb = model.encode_image(images)
            # Normalise each frame embedding to unit length before averaging.
            per_batch.append(emb / emb.norm(dim=-1, keepdim=True))

    # The per-frame embeddings themselves are small, so concatenating them
    # and averaging once keeps the result equivalent to a single-pass mean.
    return torch.cat(per_batch).mean(dim=0).numpy()


def process_videos_parallel(video_files, max_workers=2):
    """Compute one embedding per video using a pool of worker threads.

    Args:
        video_files: Iterable of video file paths.
        max_workers: Size of the thread pool.

    Returns:
        List with the result of ``video_to_embedding_cpu`` for each path, in
        the same order as ``video_files``.
    """
    pool = ThreadPoolExecutor(max_workers=max_workers)
    with pool:
        # map() preserves input order; materialise before the pool shuts down.
        per_video = pool.map(video_to_embedding_cpu, video_files)
        return list(per_video)


def process_inputs(files):
    """Route files to the image or video pipeline by extension and embed them.

    Args:
        files: Iterable of path strings. Matching is case-insensitive on the
            file-name suffix; entries with an unsupported suffix are ignored.

    Returns:
        Dict that contains an ``"images"`` entry (output of
        ``images_to_embeddings_cpu``) when at least one image was given and a
        ``"videos"`` entry (output of ``process_videos_parallel``) when at
        least one video was given.
    """
    buckets = {"images": [], "videos": []}

    # Single pass so that one-shot iterables are handled as well.
    for path in files:
        name = path.lower()
        if name.endswith(SUPPORTED_IMAGE_EXTS):
            kind = "images"
        elif name.endswith(SUPPORTED_VIDEO_EXTS):
            kind = "videos"
        else:
            continue
        buckets[kind].append(path)

    results = {}
    if buckets["images"]:
        results["images"] = images_to_embeddings_cpu(buckets["images"])
    if buckets["videos"]:
        results["videos"] = process_videos_parallel(buckets["videos"])
    return results