| | """ |
| | python clip_script.py --captions_path /path/to/coco2014_val/captions \ |
| | --fid_images_path /path/to/synthetic_images \ |
| | --output_path /path/to/output/clip_scores.csv |
| | |
| | 1. `--captions_path`: The path to the real images captions directory. In this example, |
| | it is set to `/path/to/coco2014_val/captions`. This path should point to the |
| | directory containing the COCO 2014 validation dataset captions. |
| | |
| | 2. `--fid_images_path`: The path to the directory containing subfolders with synthetic |
| | images. In this example, it is set to `/path/to/synthetic_images`. Each subfolder |
| | should contain a set of synthetic images for which you want to compute CLIP scores |
| | against the captions from `--captions_path`. |
| | |
| | 3. `--output_path`: The path to the output CSV file where the CLIP scores will be saved. |
| | In this example, it is set to `/path/to/output/clip_scores.csv`. This file will |
| | contain a table with two columns: `cfg` and `clip_score`. The `cfg` |
| | column lists the names of the subfolders in `--fid_images_path`, and the |
| | `clip_score` column lists the corresponding average CLIP scores between the synthetic |
| | images in each subfolder and the captions from `--captions_path`. |
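For illustration, given hypothetical guidance-scale subfolders `cfg_2.0` and
`cfg_4.0`, the inputs and the resulting CSV would look like (scores are
placeholder values):

    /path/to/synthetic_images/cfg_2.0/image0.png, image1.png, ...
    /path/to/coco2014_val/captions/<caption files>.txt

    cfg,clip_score
    cfg_2.0,0.26
    cfg_4.0,0.31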
| | """ |
| |
|
| | import argparse |
| | import csv |
| | import os |
| | from glob import glob |
| |
|
| | import open_clip |
| | import torch |
| | import torch.nn as nn |
| | from PIL import Image |
| | from tqdm import tqdm |
| |
|
| |
|
class CLIPEncoder(nn.Module):
    def __init__(self, clip_version='ViT-B-32', pretrained='', cache_dir=None, device='cuda'):
        super().__init__()

        self.clip_version = clip_version
        # Pick a default pretrained checkpoint matching the architecture when none is given.
        if not pretrained:
            if self.clip_version == 'ViT-H-14':
                self.pretrained = 'laion2b_s32b_b79k'
            elif self.clip_version == 'ViT-g-14':
                self.pretrained = 'laion2b_s12b_b42k'
            else:
                self.pretrained = 'openai'
        else:
            self.pretrained = pretrained

        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            self.clip_version, pretrained=self.pretrained, cache_dir=cache_dir
        )

        self.model.eval()
        self.model.to(device)

        self.device = device

    @torch.no_grad()
    def get_clip_score(self, text, image):
        # Accept either an image path or a PIL image; preprocess into a 1-image batch.
        if isinstance(image, str):
            image = Image.open(image).convert('RGB')
        if isinstance(image, Image.Image):
            image = self.preprocess(image).unsqueeze(0).to(self.device)
        image_features = self.model.encode_image(image).float()
        image_features /= image_features.norm(dim=-1, keepdim=True)

        if not isinstance(text, (list, tuple)):
            text = [text]
        text = open_clip.tokenize(text).to(self.device)
        text_features = self.model.encode_text(text).float()
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Cosine similarity between each image and each caption.
        similarity = image_features @ text_features.T

        return similarity
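
# A minimal usage sketch for CLIPEncoder (illustrative; the caption and image path
# below are hypothetical, and a CUDA device is assumed):
#
#     encoder = CLIPEncoder(clip_version='ViT-B-32')
#     score = encoder.get_clip_score('a photo of a dog', '/path/to/image.png')
#     print(float(score[0, 0]))  # scalar cosine similarity
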
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--captions_path', default='/coco2014/coco2014_val_sampled_30k/captions/', type=str)
    parser.add_argument('--fid_images_path', default=None, type=str)
    parser.add_argument('--output_path', default='./clip_scores.csv', type=str)
    parser.add_argument('--clip_version', default='ViT-L-14', type=str)
    args = parser.parse_args()
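
    # torch.distributed below relies on the environment variables set by torchrun
    # (RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR/MASTER_PORT). An illustrative
    # launch, with a hypothetical GPU count:
    #
    #     torchrun --nproc_per_node=8 clip_script.py --fid_images_path /path/to/synthetic_images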
    torch.distributed.init_process_group(backend='nccl')
    torch.cuda.set_device(int(os.environ['LOCAL_RANK']))
    rank = torch.distributed.get_rank()

    captions_path = args.captions_path
    print('Init CLIP Encoder..')
    encoder = CLIPEncoder(clip_version=args.clip_version)

    # Only rank 0 writes the CSV; all ranks participate in scoring and see the
    # all-reduced result.
    csvfile, writer = None, None
    if rank == 0:
        csvfile = open(args.output_path, 'w', newline='')
        writer = csv.DictWriter(csvfile, fieldnames=['cfg', 'clip_score'])
        writer.writeheader()

    for subfolder in sorted(os.listdir(args.fid_images_path)):
        subfolder_path = os.path.join(args.fid_images_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        # Sort images numerically by the index embedded in 'image<N>.png' so they
        # line up with the alphabetically sorted caption files.
        images = sorted(
            glob(f'{subfolder_path}/*.png'),
            key=lambda x: int(os.path.splitext(os.path.basename(x))[0].replace('image', '')),
        )
        texts = sorted(glob(f'{captions_path}/*.txt'))
        print(images[:5], texts[:5])

        texts = texts[: len(images)]
        assert len(images) == len(texts)
        print(f'Number of image-text pairs: {len(images)}')

        # Shard the aligned file lists across ranks; shuffle=False keeps the
        # image-caption pairing intact on every rank.
        imgs = torch.utils.data.DataLoader(
            images, sampler=torch.utils.data.distributed.DistributedSampler(images, shuffle=False)
        )
        txts = torch.utils.data.DataLoader(
            texts, sampler=torch.utils.data.distributed.DistributedSampler(texts, shuffle=False)
        )

        ave_sim = torch.tensor(0.0).cuda()
        count = 0
        for text, img in zip(tqdm(txts), imgs):
            with open(text[0], 'r') as f:
                text = f.read().strip()
            sim = encoder.get_clip_score(text, img[0])
            ave_sim += sim[0, 0]
            count += 1
            if count % 2000 == 0:
                print(ave_sim / count)

        # Sum the per-rank partial sums, then average over the full set of pairs.
        torch.distributed.all_reduce(ave_sim)
        ave_sim /= len(images)
        print(f'The CLIP similarity for CFG {subfolder}: {ave_sim}')

        if writer is not None:
            writer.writerow({'cfg': subfolder, 'clip_score': float(ave_sim)})

    if csvfile is not None:
        csvfile.close()