import functools

import torch
import torch.nn.functional as F

from autoattack import AutoAttack

from models.model import *
from utils import one_hot_embedding

# Valid pixel range for the (already normalized to [0, 1]) input images.
# NOTE(review): `clamp` below is only ever called with *tensor* bounds
# (e.g. `lower_limit - X`); passing these raw ints to `torch.min`/`torch.max`
# would hit the reduction overload instead — keep bounds as tensors.
lower_limit, upper_limit = 0, 1


def clamp(X, lower_limit, upper_limit):
    """Elementwise clamp of ``X`` between tensor bounds ``lower_limit``/``upper_limit``."""
    return torch.max(torch.min(X, upper_limit), lower_limit)


def _init_delta(X, norm, epsilon, randomize=True):
    """Create the initial PGD perturbation for a batch of images.

    When ``randomize`` is True the perturbation is drawn uniformly inside the
    epsilon ball of the given ``norm`` ("l_inf" or "l_2"); otherwise it starts
    at zero.  The result is clamped so ``X + delta`` stays in
    ``[lower_limit, upper_limit]`` and has ``requires_grad`` enabled.

    Raises:
        ValueError: if ``randomize`` is True and ``norm`` is unsupported.
    """
    delta = torch.zeros_like(X).cuda()
    if randomize:
        if norm == "l_inf":
            delta.uniform_(-epsilon, epsilon)
        elif norm == "l_2":
            # Sample a direction, then scale to a random radius <= epsilon.
            delta.normal_()
            d_flat = delta.view(delta.size(0), -1)
            n = d_flat.norm(p=2, dim=1).view(delta.size(0), 1, 1, 1)
            r = torch.zeros_like(n).uniform_(0, 1)
            delta *= r / n * epsilon
        else:
            raise ValueError(f"Unsupported norm: {norm!r}")
    delta = clamp(delta, lower_limit - X.detach(), upper_limit - X.detach())
    delta.requires_grad = True
    return delta


def _pgd_direction(delta, grad, norm, alpha, epsilon):
    """One unprojected PGD ascent step: move ``delta`` along ``grad`` and
    re-project it onto the epsilon ball of ``norm``."""
    if norm == "l_inf":
        d = torch.clamp(delta + alpha * torch.sign(grad), min=-epsilon, max=epsilon)
    elif norm == "l_2":
        g_norm = torch.norm(grad.view(grad.shape[0], -1), dim=1).view(-1, 1, 1, 1)
        scaled_g = grad / (g_norm + 1e-10)
        d = (delta + scaled_g * alpha).view(delta.size(0), -1) \
            .renorm(p=2, dim=0, maxnorm=epsilon).view_as(delta)
    else:
        # Unreachable when norm was validated at init; keep delta unchanged.
        d = delta
    return d


def _pgd_step(delta, X, norm, alpha, epsilon):
    """Full PGD step using ``delta.grad``: ascend, project onto the epsilon
    ball, then clamp so ``X + delta`` stays inside the valid pixel range.
    Returns the new perturbation tensor (caller assigns it to ``delta.data``)."""
    d = _pgd_direction(delta, delta.grad.detach(), norm, alpha, epsilon)
    return clamp(d, lower_limit - X, upper_limit - X)


def _model_logits(prompter, model, add_prompter, X_adv, text_tokens):
    """Shared forward pass: preprocess, apply the optional visual prompter and
    additional prompt token, and return ``multiGPU_CLIP``'s output tuple."""
    _images = clip_img_preprocessing(X_adv)
    prompted_images = prompter(_images) if prompter is not None else _images
    prompt_token = add_prompter() if add_prompter is not None else None
    return multiGPU_CLIP(model, prompted_images, text_tokens, prompt_token)


def attack_pgd_p(prompter, model, add_prompter, criterion, X, target, text_tokens,
                 alpha, attack_iters, norm, restarts=1, early_stop=True, epsilon=0,
                 attack_prob=1.0):
    """PGD attack applied to each sample independently with probability
    ``attack_prob``; unattacked samples keep a zero perturbation.

    Returns the final perturbation ``delta`` (same shape as ``X``).
    ``restarts`` and ``early_stop`` are accepted for interface compatibility
    but currently unused.
    """
    delta = _init_delta(X, norm, epsilon)

    # Per-sample Bernoulli mask, broadcast over (C, H, W): 1 -> attack, 0 -> skip.
    mask = torch.bernoulli(torch.full((X.size(0),), attack_prob)).to(device=X.device)
    mask = mask.view(-1, 1, 1, 1)

    for _ in range(attack_iters):
        output, _ = _model_logits(prompter, model, add_prompter, X + delta, text_tokens)
        loss = criterion(output, target)
        loss.backward()
        d = _pgd_step(delta, X, norm, alpha, epsilon)
        # Masked-out samples are forced back to zero perturbation each iteration.
        delta.data = torch.where(mask.bool(), d, torch.zeros_like(d))
        delta.grad.zero_()
    return delta


def attack_pgd(prompter, model, add_prompter, criterion, X, target, text_tokens,
               alpha, attack_iters, norm, restarts=1, early_stop=True, epsilon=0):
    """Standard PGD attack maximizing ``criterion`` (typically cross-entropy).

    Returns the final perturbation ``delta``.  ``restarts`` and ``early_stop``
    are accepted for interface compatibility but currently unused.
    """
    delta = _init_delta(X, norm, epsilon)
    for _ in range(attack_iters):
        output, _ = _model_logits(prompter, model, add_prompter, X + delta, text_tokens)
        loss = criterion(output, target)
        loss.backward()
        delta.data = _pgd_step(delta, X, norm, alpha, epsilon)
        delta.grad.zero_()
    return delta


def attack_pgd_motivation(prompter, model, add_prompter, criterion, X, target,
                          text_tokens, alpha, attack_iters, norm, restarts=1,
                          early_stop=True, epsilon=0):
    """PGD variant kept for the "motivation" experiments.

    NOTE(review): functionally identical to :func:`attack_pgd` (the original
    only differed by an extra ``X.detach()`` when allocating ``delta``, which
    does not change the result).  Kept as a separate entry point so existing
    callers keep working.
    """
    delta = _init_delta(X, norm, epsilon)
    for _ in range(attack_iters):
        output, _ = _model_logits(prompter, model, add_prompter, X + delta, text_tokens)
        loss = criterion(output, target)
        loss.backward()
        delta.data = _pgd_step(delta, X, norm, alpha, epsilon)
        delta.grad.zero_()
    return delta


def high_curv_point(prompter, model, add_prompter, criterion, X, target, text_tokens,
                    alpha, attack_iters, norm, restarts=1, early_stop=True, epsilon=0):
    """PGD ascent starting from a *zero* perturbation (no random init).

    Presumably used to locate high-curvature points near the clean input —
    TODO confirm against callers.  Same update rule and return value as
    :func:`attack_pgd`.
    """
    delta = _init_delta(X, norm, epsilon, randomize=False)
    for _ in range(attack_iters):
        output, _ = _model_logits(prompter, model, add_prompter, X + delta, text_tokens)
        loss = criterion(output, target)
        loss.backward()
        delta.data = _pgd_step(delta, X, norm, alpha, epsilon)
        delta.grad.zero_()
    return delta


def attack_pgd_nuc(prompter, model, add_prompter, criterion, X, target, text_tokens,
                   alpha, attack_iters, norm, ori_nat_logits, W_CE, W_reg,
                   restarts=1, early_stop=True, epsilon=0):
    """PGD maximizing a weighted sum of the classification loss and the
    nuclear norm of the logit drift from the clean logits.

    loss = W_CE * CE(output, target)
         + W_reg * ||ori_nat_logits - output||_nuc / batch_size
    """
    delta = _init_delta(X, norm, epsilon)
    for _ in range(attack_iters):
        output, _ = _model_logits(prompter, model, add_prompter, X + delta, text_tokens)
        loss = (W_CE * criterion(output, target)
                + W_reg * torch.norm(ori_nat_logits - output, 'nuc') / X.size(0))
        loss.backward()
        delta.data = _pgd_step(delta, X, norm, alpha, epsilon)
        delta.grad.zero_()
    return delta


def attack_TRADES_KL(prompter, model, add_prompter, criterion, X, target, text_tokens,
                     alpha, attack_iters, norm, ori_nat_logits, restarts=1,
                     early_stop=True, epsilon=0):
    """TRADES-style PGD: maximize KL(softmax(clean logits) || softmax(adv logits)).

    ``criterion`` is accepted for interface compatibility but the loss used is
    a batch-mean KL divergence against ``ori_nat_logits``.
    """
    criterion_KL = torch.nn.KLDivLoss(reduction='batchmean').cuda()
    delta = _init_delta(X, norm, epsilon)
    for _ in range(attack_iters):
        output, _ = _model_logits(prompter, model, add_prompter, X + delta, text_tokens)
        loss = criterion_KL(F.log_softmax(output, dim=1),
                            F.softmax(ori_nat_logits, dim=1))
        loss.backward()
        delta.data = _pgd_step(delta, X, norm, alpha, epsilon)
        delta.grad.zero_()
    return delta


def criterion_L2(out, targets, reduction='mean'):
    """Mean (over the batch) of the per-sample *squared* L2 distance.

    Unlike plain MSE this sums over the embedding dimension instead of
    averaging, so it is not divided by the latent dimension.  ``out`` and
    ``targets`` should have shape (batch_size, embedding_size).
    ``reduction`` is accepted for interface compatibility but unused.
    """
    squared_error_batch = F.mse_loss(out, targets, reduction='none')
    squared_error_batch = torch.mean(squared_error_batch.sum(dim=1))
    return squared_error_batch


def attack_FARE_Emb_L2(prompter, model, add_prompter, criterion, X, target,
                       text_tokens, alpha, attack_iters, norm, ori_nat_emb,
                       restarts=1, early_stop=True, epsilon=0):
    """FARE-style PGD: maximize the squared L2 distance between the adversarial
    image embedding and the clean embedding ``ori_nat_emb``.

    Requires ``multiGPU_CLIP(..., is_embedding=True)`` to also return the
    image embedding.
    """
    delta = _init_delta(X, norm, epsilon)
    for _ in range(attack_iters):
        _images = clip_img_preprocessing(X + delta)
        prompted_images = prompter(_images) if prompter is not None else _images
        prompt_token = add_prompter() if add_prompter is not None else None
        output, _, output_emb, _ = multiGPU_CLIP(model, prompted_images, text_tokens,
                                                 prompt_token, is_embedding=True)
        loss = criterion_L2(output_emb, ori_nat_emb)
        loss.backward()
        delta.data = _pgd_step(delta, X, norm, alpha, epsilon)
        delta.grad.zero_()
    return delta


def attack_CW(prompter, model, add_prompter, criterion, X, target, text_tokens,
              alpha, attack_iters, norm, restarts=1, early_stop=True, epsilon=0):
    """Carlini-Wagner style PGD using the margin loss
    ``-sum(relu(correct_logit - best_wrong_logit + 50))``.

    ``criterion`` is accepted for interface compatibility but unused.
    """
    delta = _init_delta(X, norm, epsilon)
    for _ in range(attack_iters):
        output, _ = _model_logits(prompter, model, add_prompter, X + delta, text_tokens)
        num_class = output.size(1)
        label_mask = one_hot_embedding(target, num_class).cuda()
        correct_logit = torch.sum(label_mask * output, dim=1)
        # Mask the true class out with a large negative offset before the max.
        wrong_logit, _ = torch.max((1 - label_mask) * output - 1e4 * label_mask, dim=1)
        # Margin of 50: keeps pushing even once the sample is misclassified.
        loss = -torch.sum(F.relu(correct_logit - wrong_logit + 50))
        loss.backward()
        delta.data = _pgd_step(delta, X, norm, alpha, epsilon)
        delta.grad.zero_()
    return delta


def attack_auto(model, images, target, text_tokens, prompter, add_prompter,
                attacks_to_run=['apgd-ce', 'apgd-dlr'], epsilon=0):
    """Run AutoAttack (Linf, standard version) against the CLIP image-logits
    forward pass and return the adversarial images.

    NOTE(review): the list default is never mutated here, so sharing it across
    calls is safe.
    """
    forward_pass = functools.partial(
        multiGPU_CLIP_image_logits,
        model=model, text_tokens=text_tokens,
        prompter=prompter, add_prompter=add_prompter,
    )
    adversary = AutoAttack(forward_pass, norm='Linf', eps=epsilon,
                           version='standard', verbose=False)
    adversary.attacks_to_run = attacks_to_run
    x_adv = adversary.run_standard_evaluation(images, target, bs=images.shape[0])
    return x_adv


def attack_auto_new(model, images, target, text_tokens, prompter, add_prompter,
                    attacks_to_run=['apgd-ce', 'apgd-dlr'], epsilon=0):
    """AutoAttack variant that wraps the full prompter + preprocessing pipeline
    in a closure and casts the logits to float32 for AutoAttack."""

    def model_fn(x):
        preprocessed = clip_img_preprocessing(x)
        prompted_images = prompter(preprocessed) if prompter is not None else preprocessed
        prompt_token = add_prompter() if add_prompter is not None else None
        output_a, _ = multiGPU_CLIP(model, prompted_images, text_tokens, prompt_token)
        return output_a.to(torch.float32)

    adversary = AutoAttack(model_fn, norm='Linf', eps=epsilon,
                           version='standard', verbose=False)
    adversary.attacks_to_run = attacks_to_run
    x_adv = adversary.run_standard_evaluation(images, target, bs=images.shape[0])
    return x_adv


# ---------------------------------------------------------------------------
# Joint image / text-prompt attacks
# ---------------------------------------------------------------------------

def attack_pgd_adv_prompt(prompter, model, add_prompter, criterion, X, target,
                          text_tokens, alpha, attack_iters, norm, prompt_learner,
                          text_perb_stepsize, restarts=1, early_stop=True, epsilon=0):
    """Joint attack: PGD on the image AND gradient *ascent* on the learnable
    text prompt (``prompt_learner``), via sign-flipped gradients fed to SGD.

    Side effect: ``prompt_learner``'s parameters are updated in place.
    Returns the final image perturbation ``delta``.
    """
    delta = _init_delta(X, norm, epsilon)

    # Plain SGD (no momentum / weight decay) so each step is exactly
    # lr * (-grad), i.e. a fixed-size ascent step after the sign flip below.
    prompter_optim = torch.optim.SGD(prompt_learner.parameters(),
                                     lr=text_perb_stepsize,
                                     momentum=0, weight_decay=0)

    for _ in range(attack_iters):
        _images = clip_img_preprocessing(X + delta)
        prompted_images = prompter(_images) if prompter is not None else _images
        prompt_token = add_prompter() if add_prompter is not None else None
        output, _ = multiGPU_CLIP_Text_Prompt_Tuning(model, prompted_images,
                                                     text_tokens, prompt_token,
                                                     prompt_learner)
        loss = criterion(output, target)
        prompter_optim.zero_grad()
        loss.backward()

        # Image branch: standard PGD update.
        delta.data = _pgd_step(delta, X, norm, alpha, epsilon)
        delta.grad.zero_()

        # Text branch: flip gradient signs so the SGD step *maximizes* the loss.
        for param in prompt_learner.parameters():
            if param.grad is not None:
                param.grad.data = -1.0 * param.grad.data
        prompter_optim.step()

    prompter_optim.zero_grad()
    return delta


def attack_pgd_adv_promptONLY(prompter, model, add_prompter, criterion, X, target,
                              text_tokens, alpha, attack_iters, norm, prompt_learner,
                              text_perb_stepsize, restarts=1, early_stop=True,
                              epsilon=0):
    """Attack that perturbs ONLY the text branch: the image is left clean and
    the learnable text prompt is driven by gradient ascent (sign-flipped SGD).

    Side effect: ``prompt_learner``'s parameters are updated in place.
    Returns a zero image perturbation for interface compatibility with the
    other attacks.
    """
    delta = torch.zeros_like(X).cuda()

    prompter_optim = torch.optim.SGD(prompt_learner.parameters(),
                                     lr=text_perb_stepsize,
                                     momentum=0, weight_decay=0)

    for _ in range(attack_iters):
        _images = clip_img_preprocessing(X)  # clean images: no delta added
        prompted_images = prompter(_images) if prompter is not None else _images
        prompt_token = add_prompter() if add_prompter is not None else None
        output, _ = multiGPU_CLIP_Text_Prompt_Tuning(model, prompted_images,
                                                     text_tokens, prompt_token,
                                                     prompt_learner)
        loss = criterion(output, target)
        prompter_optim.zero_grad()
        loss.backward()

        # Flip gradient signs so the SGD step *maximizes* the loss.
        for param in prompt_learner.parameters():
            if param.grad is not None:
                param.grad.data = -1.0 * param.grad.data
        prompter_optim.step()

    prompter_optim.zero_grad()
    return delta