import sys
import os

# Make local utility modules importable regardless of the working directory.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "utils"))

import math
import time
import pickle
import wandb
import numpy as np
import torch
import torch.nn as nn
from typing import Optional
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModel,
    AutoProcessor,
    CLIPModel,
    CLIPProcessor,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
    CLIPVisionModelWithProjection,
)

import core.vision_encoder.pe as pe
import core.vision_encoder.transforms as pe_transformer

import clip  # OpenAI CLIP reference implementation (used for the RN50 variant)

from video_embedder import VideoEmbedder


def set_seed(seed=42):
    """Seed NumPy and PyTorch (CPU and all CUDA devices) for reproducibility."""
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

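# Note: seeding alone makes sampling reproducible but not GPU kernels; for
# bit-exact runs one could additionally set torch.backends.cudnn.deterministic
# = True and torch.backends.cudnn.benchmark = False.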
set_seed(42)

# Maps lowercase dataset keys to the directory names used on disk.
DATASET_MAP = {
    "breakfast": "Breakfast",
    "ucf101": "UCF101",
    "hmdb": "HMDB",
    "ssv2": "Something2",
    "jester": "Jester",
}

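# Expected on-disk layout (inferred from the paths in process_dataset below):
#     ../Datasets/<Name>/Video_data/        raw videos
#     ../Embeddings/Videos/<Name>/*.pkl     cached VideoEmbedder pickles
#     ../Embeddings/Datasets/               intermediate outputs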


def process_dataset(dataset_key, clip_model, window_size=16, random=True,
                    batch_size: int = 256,
                    pe_video_batch_size: Optional[int] = None,
                    pe_target_T: Optional[int] = None,
                    enable_tf32: bool = True):
    """Embed all videos in a dataset with the chosen CLIP variant and cache
    the fitted VideoEmbedder to disk as a pickle."""
    dataset_name = DATASET_MAP.get(dataset_key.lower())
    if dataset_name is None:
        raise ValueError(f"Unknown dataset: {dataset_key}")

    folder_path = [f"../Datasets/{dataset_name}/Video_data"]
    output_dir = "../Embeddings/Datasets"
    embedd_path = f"../Embeddings/Videos/{dataset_name}/{random}_{window_size}_clip_{clip_model}.pkl"

    # TF32 trades a little matmul precision for a large speedup on
    # Ampere-and-newer GPUs; acceptable for feature extraction.
    if enable_tf32 and torch.cuda.is_available():
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    # Select the vision-language backbone; all models are put in eval mode for
    # deterministic feature extraction.
    if clip_model == "b32":
        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").eval()
        processor = CLIPProcessor.from_pretrained(
            "openai/clip-vit-base-patch32", use_fast=True
        )
    elif clip_model == "b16":
        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").eval()
        processor = CLIPProcessor.from_pretrained(
            "openai/clip-vit-base-patch16", use_fast=True
        )
    elif clip_model == "l14":
        model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").eval()
        processor = CLIPProcessor.from_pretrained(
            "openai/clip-vit-large-patch14", use_fast=True
        )
    elif clip_model == "res50":
        # clip.load() returns the model together with its preprocessing
        # transform; fall back to CPU so the script runs without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model, processor = clip.load("RN50", device=device)
    elif clip_model == "clip4clip":
        # Embeddings come from the vision tower; the full model and text tower
        # below are loaded but currently unused by the embedding path.
        model = CLIPVisionModelWithProjection.from_pretrained(
            "Searchium-ai/clip4clip-webvid150k"
        ).eval()
        clip_full = CLIPModel.from_pretrained("Searchium-ai/clip4clip-webvid150k")
        model_text = CLIPTextModelWithProjection.from_pretrained(
            "Searchium-ai/clip4clip-webvid150k"
        )
        processor = CLIPTokenizer.from_pretrained("Searchium-ai/clip4clip-webvid150k")
    elif clip_model == "siglip":
        model = AutoModel.from_pretrained("google/siglip-base-patch16-224").eval()
        processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
    elif clip_model == "siglip2":
        model = AutoModel.from_pretrained("google/siglip2-base-patch32-256").eval()
        processor = AutoProcessor.from_pretrained("google/siglip2-base-patch32-256")
    elif clip_model == "pe-l14":
        # Perception Encoder uses its own transforms rather than an HF processor.
        model = pe.CLIP.from_config("PE-Core-L14-336")
        processor = pe_transformer.get_image_transform(model.image_size)
        tokenizer = pe_transformer.get_text_tokenizer(model.context_length)
    else:
        raise ValueError(f"Unknown CLIP model: {clip_model}")

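    # Sketch (assumption, not the confirmed VideoEmbedder internals): for the
    # HF CLIP variants, per-frame features are typically pooled like this:
    #     inputs = processor(images=frames, return_tensors="pt")
    #     with torch.no_grad():
    #         feats = model.get_image_features(**inputs)  # (T, D)
    #     video_vec = feats.mean(dim=0)                   # temporal mean-pool
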
    # Backbones with bespoke preprocessing get their own embedder mode; the
    # OpenAI HF checkpoints share the generic "clip" path.
    if clip_model in ("clip4clip", "siglip", "siglip2", "res50", "pe-l14"):
        embedder = VideoEmbedder(
            clip_model, model, processor,
            pe_video_batch_size=pe_video_batch_size,
            pe_target_T=pe_target_T,
        )
    else:
        embedder = VideoEmbedder("clip", model, processor)
    embedder.dataset_name = dataset_key

    # Reuse cached embeddings when present; otherwise compute and pickle them.
    # (The original except FileNotFoundError could never trigger after the
    # exists() check and left the embedder unprocessed on a failed load.)
    loaded = False
    if os.path.exists(embedd_path):
        try:
            with open(embedd_path, "rb") as f:
                embedder = pickle.load(f)
            print(f"Loaded existing embedder from {embedd_path}")
            loaded = True
        except (pickle.UnpicklingError, EOFError, OSError):
            print("Existing embedder file could not be loaded; recomputing.")
    if not loaded:
        embedder.process_data(
            folder_path,
            window_size=window_size,
            output_path=output_dir,
            random=random,
            save_intermediate=True,
            batch_size=batch_size,
        )
        os.makedirs(os.path.dirname(embedd_path), exist_ok=True)
        with open(embedd_path, "wb") as f:
            pickle.dump(embedder, f)


window_size = 32
clip_model = "pe-l14"

process_dataset(
    "breakfast",
    clip_model,
    window_size=window_size,
    random=True,
    batch_size=256,
    pe_video_batch_size=24,
    pe_target_T=8,
    enable_tf32=True,
)
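
# Sketch: the same settings could be swept over every dataset in DATASET_MAP
# (assumes each dataset follows the directory layout above). Uncomment to run:
# for key in DATASET_MAP:
#     process_dataset(key, clip_model, window_size=window_size, random=True,
#                     batch_size=256, pe_video_batch_size=24, pe_target_T=8)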