File size: 4,385 Bytes
e1aaaac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import sys
import open_clip
import torch
import torch.nn.functional as F
import numpy as np
from torchvision.transforms import transforms

from open_flamingo.eval.classification_utils import IMAGENET_1K_CLASS_ID_TO_LABEL



def print_statistics(arr):
    """Print mean/median/min/max/std and the element count of a 1-d array."""
    assert len(arr.shape) == 1  # summary below assumes a flat array
    summary = (
        f'[mean] {arr.mean():.4f} [median] {np.median(arr):.4f} [min] {arr.min():.4f} [max] '
        f'{arr.max():.4f} [std] {arr.std():.4f} [n] {len(arr)}\n'
    )
    print(summary)

def interpolate_state_dict(m1, beta=0.):
    """Interpolate a visual-encoder state dict towards the clean CLIP ViT-L weights.

    Loads the original (clean) checkpoint from one of two known cluster paths
    and returns ``{k: (1 - beta) * m1[k] + beta * clean[k]}`` for every key of
    ``m1``.

    Args:
        m1: state dict of the (fine-tuned) visual encoder.
        beta: weight of the clean checkpoint; ``0.`` reproduces ``m1``'s values.

    Returns:
        A new state dict with interpolated tensors.

    Raises:
        OSError: if neither checkpoint path is readable.
    """
    try:
        m2 = torch.load("/mnt/nsingh/project_multimodal/models/clip-vit-l-visual.pt", map_location='cpu')
    except OSError:
        # First cluster path not present on this machine; fall back to the
        # second known location. (Was a bare `except:`, which also swallowed
        # KeyboardInterrupt and genuine load errors.)
        m2 = torch.load("/data/naman_deep_singh/project_multimodal/clip-vit-l-visual.pt", map_location='cpu')
    return {k: (1 - beta) * m1[k] + beta * m2[k] for k in m1.keys()}


def load_clip_model(clip_model_name, pretrained, beta=0.):
    """Create an open_clip model and load (optionally interpolated) visual weights.

    Args:
        clip_model_name: open_clip architecture name (e.g. 'ViT-L-14').
        pretrained: 'openai' for the stock weights, a checkpoint path, or a
            state dict for the visual encoder.
        beta: if non-zero, interpolate the loaded checkpoint towards the clean
            CLIP weights with weight `beta` before loading.

    Returns:
        (model, preprocessor_no_norm, normalizer): the eval-mode model, the
        image preprocessing pipeline without its final Normalize transform,
        and that Normalize transform separately.
    """
    try:  # first attempt: load only the visual tower on top of the openai base
        model, _, image_processor = open_clip.create_model_and_transforms(
            clip_model_name, pretrained='openai', device='cpu'
        )
        if pretrained != 'openai':
            if isinstance(pretrained, str):
                checkpoint = torch.load(pretrained, map_location=torch.device('cpu'))
            else:  # a state dict was passed in directly
                checkpoint = pretrained
            # if beta non-zero interpolate between clean and pretrained ckpts
            if beta != 0.:
                print("beta", beta)
                # Bug fix: interpolate the *loaded* state dict, not the raw
                # `pretrained` argument, which may be a path string.
                checkpoint = interpolate_state_dict(checkpoint, beta)

            if 'vision_encoder_state_dict' in checkpoint.keys():  # tecoa checkpoint
                model.visual.load_state_dict(checkpoint['vision_encoder_state_dict'])
            else:
                model.visual.load_state_dict(checkpoint)
    except RuntimeError as e:  # try loading whole model
        print(f'error: {e}', file=sys.stderr)
        print('retrying by loading whole model..', file=sys.stderr)
        torch.cuda.empty_cache()
        model, _, image_processor = open_clip.create_model_and_transforms(
            clip_model_name, pretrained=pretrained, force_quick_gelu=True, device='cpu'
        )
    model.eval()

    # Remove the Normalize transform by creating a new Compose object;
    # assumes Normalize is the last transform in the pipeline — TODO confirm.
    preprocessor_no_norm = transforms.Compose(image_processor.transforms[:-1])
    normalizer = image_processor.transforms[-1]
    return model, preprocessor_no_norm, normalizer

@torch.no_grad()
def get_text_embeddings(model, dataset, texts):
    """Compute L2-normalized text embeddings, chunked to bound GPU memory.

    Exactly one of `dataset` / `texts` must be provided. For
    dataset == 'imagenet' the ImageNet-1k class names are wrapped in a fixed
    prompt template; otherwise `texts` is tokenized as-is.

    Args:
        model: wrapper whose `.model` is an open_clip model (has encode_text).
        dataset: optional dataset name; only 'imagenet' is supported.
        texts: optional list of strings to embed.

    Returns:
        CPU tensor of shape (embed_dim, n_texts) — embeddings as columns.

    Raises:
        ValueError: if neither `dataset` nor `texts` is provided.
    """
    assert not (dataset and texts)  # mutually exclusive inputs
    if dataset:
        assert dataset == 'imagenet'  # only imagenet prompts are supported
        template = 'This is a photo of a {}'
        texts = [template.format(c) for c in IMAGENET_1K_CLASS_ID_TO_LABEL.values()]
    if not texts:
        # Previously this path fell through to a NameError on `text_tokens`.
        raise ValueError('either dataset or texts must be provided')
    text_tokens = open_clip.tokenize(texts)
    embedding_text_labels_norm = []
    chunk_size = 500  # encode in chunks of 500 prompts to bound GPU memory
    for i in range(0, len(text_tokens), chunk_size):
        el = text_tokens[i:i + chunk_size]
        embedding_text_labels_norm.append(
            model.model.encode_text(el.cuda(), normalize=True).detach().cpu()
        )
    embedding_text_labels_norm = torch.cat(embedding_text_labels_norm).T
    if dataset == 'imagenet':
        # 512 / 768 correspond to the ViT-B and ViT-L text embedding widths
        assert (embedding_text_labels_norm.shape == (512, 1000)
                or embedding_text_labels_norm.shape == (768, 1000)), embedding_text_labels_norm.shape
    return embedding_text_labels_norm


@torch.inference_mode()
def compute_accuracy_no_dataloader(model, data, targets, device, batch_size=1000):
    """Compute top-1 accuracy of `model` on in-memory tensors, in batches.

    (in parts copied from autoattack)

    Args:
        model: module mapping a batch of inputs to per-class logits.
        data: input tensor; first dimension indexes samples.
        targets: 1-d tensor of integer class labels aligned with `data`.
        device: device the batches are moved to for the forward pass.
        batch_size: samples per forward pass.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 for empty `data`.

    The model's training flag is restored before returning.
    """
    if data.shape[0] == 0:
        return 0.0  # avoid ZeroDivisionError on empty input
    train_flag = model.training
    model.eval()
    n_batches = int(np.ceil(data.shape[0] / batch_size))
    n_total = 0
    n_correct = 0
    for batch_idx in range(n_batches):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, data.shape[0])
        data_batch = data[start_idx:end_idx, :].clone().to(device)
        targets_batch = targets[start_idx:end_idx].clone().to(device)
        logits = model(data_batch)
        # argmax over logits equals argmax over softmax(logits); the softmax
        # (and the unused confidences it produced) was dead work.
        preds = logits.argmax(dim=1)
        n_total += targets_batch.size(0)
        n_correct += preds.eq(targets_batch).sum().item()
    acc = n_correct / n_total
    if train_flag:
        model.train()
    return acc