import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast

from modified_clip import clip
from models.model import *
from models.prompters import TokenPrompter, NullPrompter, PromptLearner
from attacks import *
def FT_TeCoA_loss(images, target, text_tokens, optimizer, model, original_model,
                  prompter, add_prompter, prompt_learner, args):
    """
    TeCoA (PGD-AT) loss for fine-tuning: cross-entropy on adversarial images
    against the natural text tokens.
    """
    criterion = torch.nn.CrossEntropyLoss().cuda()
    if args.add_prompt_size == 0:
        prompt_token = None
    else:
        prompt_token = add_prompter()
    alpha = args.train_stepsize
    attack_iters = args.train_numsteps

    # Craft an L_inf PGD perturbation against the current model.
    delta = attack_pgd(prompter, model, add_prompter, criterion, images,
                       target, text_tokens, alpha, attack_iters, 'l_inf',
                       epsilon=args.train_eps)
    adv_img = clip_img_preprocessing(images + delta)
    prompted_adv_images = prompter(adv_img)

    # Cross-entropy on the adversarial images with the natural text tokens.
    output_Iadv_Tnat, _ = multiGPU_CLIP(model, prompted_adv_images, text_tokens, prompt_token)
    loss_cls = criterion(output_Iadv_Tnat, target)

    loss = loss_cls
    return loss, output_Iadv_Tnat
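# Hypothetical usage sketch (not part of the original training code): each
# FT_*_loss function in this module returns (loss, adversarial logits) and
# leaves the backward pass to the caller, which is why `optimizer` appears in
# every signature but is never stepped here. Under mixed precision, a
# caller-side training step might look roughly like this; `scaler` is assumed
# to be a torch.cuda.amp.GradScaler owned by the caller.
def _example_train_step(images, target, text_tokens, optimizer, scaler, model,
                        original_model, prompter, add_prompter, prompt_learner, args):
    optimizer.zero_grad()
    with autocast():
        loss, logits = FT_TeCoA_loss(images, target, text_tokens, optimizer, model,
                                     original_model, prompter, add_prompter,
                                     prompt_learner, args)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    return loss.item(), logits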
def FT_PMG_loss(images, target, text_tokens, optimizer, model, original_model,
                prompter, add_prompter, prompt_learner, args):
    """
    PMG loss for fine-tuning: PGD-AT cross-entropy plus KL alignment of the
    adversarial logits to the original CLIP model and to the current model's
    natural-image logits.
    """
    criterion = torch.nn.CrossEntropyLoss().cuda()
    if args.add_prompt_size == 0:
        prompt_token = None
    else:
        prompt_token = add_prompter()
    alpha = args.train_stepsize
    attack_iters = args.train_numsteps

    # Craft an L_inf PGD perturbation against the current model.
    delta = attack_pgd(prompter, model, add_prompter, criterion, images,
                       target, text_tokens, alpha, attack_iters, 'l_inf',
                       epsilon=args.train_eps)
    adv_img = clip_img_preprocessing(images + delta)
    prompted_adv_images = prompter(adv_img)

    # Reference predictions of the original (frozen) CLIP model on natural images.
    nat_img = clip_img_preprocessing(images)
    prompted_nat_images = prompter(nat_img)
    with torch.no_grad():
        Ori_output_Inat_Tnat, _ = multiGPU_CLIP(original_model, prompted_nat_images, text_tokens, prompt_token)

    # Classification loss on the adversarial images.
    output_Iadv_Tnat, _ = multiGPU_CLIP(model, prompted_adv_images, text_tokens, prompt_token)
    loss_cls = criterion(output_Iadv_Tnat, target)

    # Align adversarial predictions to the original model's natural predictions.
    criterion_KL = nn.KLDivLoss(reduction='batchmean').cuda()
    loss_Pred_Align_Ori = criterion_KL(F.log_softmax(output_Iadv_Tnat, dim=1),
                                       F.softmax(Ori_output_Inat_Tnat, dim=1))

    # Align adversarial predictions to the current model's natural predictions.
    output_Inat_Tnat, _ = multiGPU_CLIP(model, prompted_nat_images, text_tokens, prompt_token)
    loss_Pred_Align = criterion_KL(F.log_softmax(output_Iadv_Tnat, dim=1),
                                   F.softmax(output_Inat_Tnat, dim=1))

    loss = loss_cls + args.W_Pred_Align * loss_Pred_Align + args.W_Pred_Align_Ori * loss_Pred_Align_Ori
    return loss, output_Iadv_Tnat
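# Illustrative sketch (an assumption for clarity, not called by the losses in
# this module): with PyTorch's convention, nn.KLDivLoss(reduction='batchmean')
# applied to (log_softmax(adv_logits), softmax(ref_logits)) computes
# KL(p_ref || p_adv) averaged over the batch, i.e. the adversarial prediction
# is pulled toward the reference distribution. A manual equivalent:
def _kl_to_reference(adv_logits, ref_logits, eps=1e-12):
    p_ref = F.softmax(ref_logits, dim=1)
    log_p_adv = F.log_softmax(adv_logits, dim=1)
    return (p_ref * (torch.log(p_ref + eps) - log_p_adv)).sum(dim=1).mean()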
def FT_ImgText_PGD_loss(images, target, text_tokens, optimizer, model, original_model,
                        prompter, add_prompter, prompt_learner, args):
    """
    PGD-AT + Ori_Pred_Align loss for fine-tuning, with adversarial
    perturbations on both images and text prompts.
    """
    criterion = torch.nn.CrossEntropyLoss().cuda()
    if args.add_prompt_size == 0:
        prompt_token = None
    else:
        prompt_token = add_prompter()
    alpha = args.train_stepsize
    attack_iters = args.train_numsteps

    # Reference predictions of the original (frozen) CLIP model on natural images.
    nat_img = clip_img_preprocessing(images)
    prompted_nat_images = prompter(nat_img)
    with torch.no_grad():
        Ori_output_Inat_Tnat, _ = multiGPU_CLIP(original_model, prompted_nat_images, text_tokens, prompt_token)

    # Reset the text prompt learner, then jointly attack the image (L_inf PGD)
    # and the text prompt.
    prompt_learner.load_state_dict(args.original_prompter_state)
    delta = attack_pgd_adv_prompt(prompter, model, add_prompter, criterion, images,
                                  target, text_tokens, alpha, attack_iters, 'l_inf',
                                  prompt_learner, args.text_perb_stepsize, epsilon=args.train_eps)
    adv_img = clip_img_preprocessing(images + delta)
    prompted_adv_images = prompter(adv_img)

    # Classification loss on the adversarial images with the perturbed text prompts.
    output_Iadv_Tnat, _ = multiGPU_CLIP_Text_Prompt_Tuning(model, prompted_adv_images, text_tokens, prompt_token, prompt_learner)
    loss_cls = criterion(output_Iadv_Tnat, target)

    # Align adversarial predictions to the original model's natural predictions.
    criterion_KL = nn.KLDivLoss(reduction='batchmean').cuda()
    loss_Pred_Align_Ori = criterion_KL(F.log_softmax(output_Iadv_Tnat, dim=1),
                                       F.softmax(Ori_output_Inat_Tnat, dim=1))

    loss = loss_cls + args.W_Pred_Align_Ori * loss_Pred_Align_Ori
    return loss, output_Iadv_Tnat
def FT_TRADES_loss(images, target, text_tokens, optimizer, model, original_model,
                   prompter, add_prompter, prompt_learner, args):
    """
    TRADES-style loss for fine-tuning: natural cross-entropy plus KL alignment
    of the adversarial predictions to the original CLIP model.
    """
    criterion = torch.nn.CrossEntropyLoss().cuda()
    if args.add_prompt_size == 0:
        prompt_token = None
    else:
        prompt_token = add_prompter()
    alpha = args.train_stepsize
    attack_iters = args.train_numsteps

    # Reference predictions of the original (frozen) CLIP model on natural images.
    nat_img = clip_img_preprocessing(images)
    prompted_nat_images = prompter(nat_img)
    with torch.no_grad():
        Ori_output_Inat_Tnat, _ = multiGPU_CLIP(original_model, prompted_nat_images, text_tokens, prompt_token)

    # TRADES inner maximization: perturbation maximizing the KL divergence
    # from the reference predictions.
    delta = attack_TRADES_KL(prompter, model, add_prompter, criterion, images,
                             target, text_tokens, alpha, attack_iters, 'l_inf',
                             Ori_output_Inat_Tnat, epsilon=args.train_eps)
    adv_img = clip_img_preprocessing(images + delta)
    prompted_adv_images = prompter(adv_img)

    # Adversarial predictions, optionally with multiplicative noise.
    if args.mul_noise_beta > 0.0:
        output_Iadv_Tnat, _ = multiGPU_CLIP_multiply_noise(model, prompted_adv_images, text_tokens, prompt_token, beta=args.mul_noise_beta)
    else:
        output_Iadv_Tnat, _ = multiGPU_CLIP(model, prompted_adv_images, text_tokens, prompt_token)

    # Natural classification loss.
    output_Inat_Tnat, _ = multiGPU_CLIP(model, prompted_nat_images, text_tokens, prompt_token)
    loss_nat_cls = criterion(output_Inat_Tnat, target)

    # Align adversarial predictions to the original model's natural predictions.
    criterion_KL = nn.KLDivLoss(reduction='batchmean').cuda()
    loss_Pred_Align_Ori = criterion_KL(F.log_softmax(output_Iadv_Tnat, dim=1),
                                       F.softmax(Ori_output_Inat_Tnat, dim=1))

    loss = loss_nat_cls + args.W_Pred_Align_Ori * loss_Pred_Align_Ori
    return loss, output_Iadv_Tnat
def criterion_L2(out, targets, reduction='mean'):
    """
    Squared L2 distance between `out` and `targets`: summed over the feature
    dimension per sample, then averaged over the batch. The `reduction`
    argument is kept for API compatibility but only the batch mean is computed.
    Equivalent to ((out - targets) ** 2).sum(dim=1).mean().
    """
    squared_error_batch = F.mse_loss(out, targets, reduction='none')
    squared_error_batch = torch.mean(squared_error_batch.sum(dim=1))
    return squared_error_batch
def FT_FARE_loss(images, target, text_tokens, optimizer, model, original_model,
                 prompter, add_prompter, prompt_learner, args):
    """
    FARE loss for fine-tuning: min-max L2 alignment of the adversarial image
    embedding to the original CLIP model's natural-image embedding.
    """
    criterion = torch.nn.CrossEntropyLoss().cuda()
    if args.add_prompt_size == 0:
        prompt_token = None
    else:
        prompt_token = add_prompter()
    alpha = args.train_stepsize
    attack_iters = args.train_numsteps

    # Reference embedding of the original (frozen) CLIP model on natural images.
    nat_img = clip_img_preprocessing(images)
    prompted_nat_images = prompter(nat_img)
    with torch.no_grad():
        Ori_output_Inat_Tnat, _, Ori_emb_Inat_Tnat, _ = multiGPU_CLIP(original_model, prompted_nat_images, text_tokens, prompt_token, is_embedding=True)

    # Inner maximization: perturbation pushing the embedding away from the
    # original model's natural-image embedding.
    delta = attack_FARE_Emb_L2(prompter, model, add_prompter, criterion, images,
                               target, text_tokens, alpha, attack_iters, 'l_inf',
                               Ori_emb_Inat_Tnat, epsilon=args.train_eps)
    adv_img = clip_img_preprocessing(images + delta)
    prompted_adv_images = prompter(adv_img)

    # Adversarial logits (optionally with multiplicative noise) and embedding.
    # As in FT_FARE_loss_weighted, the embedding is always taken from
    # multiGPU_CLIP so that the loss below is defined in both branches.
    if args.mul_noise_beta > 0.0:
        output_Iadv_Tnat, _ = multiGPU_CLIP_multiply_noise(model, prompted_adv_images, text_tokens, prompt_token, beta=args.mul_noise_beta)
        _, _, emb_Iadv_Tnat, _ = multiGPU_CLIP(model, prompted_adv_images, text_tokens, prompt_token, is_embedding=True)
    else:
        output_Iadv_Tnat, _, emb_Iadv_Tnat, _ = multiGPU_CLIP(model, prompted_adv_images, text_tokens, prompt_token, is_embedding=True)

    # Outer minimization: squared L2 distance to the original embedding.
    loss = criterion_L2(emb_Iadv_Tnat, Ori_emb_Inat_Tnat)

    return loss, output_Iadv_Tnat
def FT_FARE_loss_weighted(images, target, text_tokens, optimizer, model, original_model,
                          prompter, add_prompter, prompt_learner, weights, args):
    """
    FARE embedding alignment with optional per-sample reweighting.
    """
    criterion = torch.nn.CrossEntropyLoss().cuda()
    if args.add_prompt_size == 0:
        prompt_token = None
    else:
        prompt_token = add_prompter()

    # Reference embedding of the original (frozen) CLIP model on natural images.
    nat_img = clip_img_preprocessing(images)
    prompted_nat_images = prompter(nat_img)
    with torch.no_grad():
        Ori_output_Inat_Tnat, _, Ori_emb_Inat_Tnat, _ = multiGPU_CLIP(original_model,
                                                                      prompted_nat_images,
                                                                      text_tokens,
                                                                      prompt_token,
                                                                      is_embedding=True)

    # Inner maximization against the reference embedding.
    delta = attack_FARE_Emb_L2(prompter, model, add_prompter, criterion, images,
                               target, text_tokens, args.train_stepsize, args.train_numsteps,
                               'l_inf', Ori_emb_Inat_Tnat, epsilon=args.train_eps)
    adv_img = clip_img_preprocessing(images + delta)
    prompted_adv_images = prompter(adv_img)

    # Adversarial logits (optionally with multiplicative noise) and embedding.
    if args.mul_noise_beta > 0.0:
        output_Iadv_Tnat, _ = multiGPU_CLIP_multiply_noise(model, prompted_adv_images,
                                                           text_tokens, prompt_token,
                                                           beta=args.mul_noise_beta)
        _, _, emb_Iadv_Tnat, _ = multiGPU_CLIP(model, prompted_adv_images, text_tokens,
                                               prompt_token, is_embedding=True)
    else:
        output_Iadv_Tnat, _, emb_Iadv_Tnat, _ = multiGPU_CLIP(model, prompted_adv_images,
                                                              text_tokens, prompt_token,
                                                              is_embedding=True)

    # Per-sample squared L2 distance to the reference embedding.
    sample_losses = torch.sum((emb_Iadv_Tnat - Ori_emb_Inat_Tnat) ** 2, dim=1)

    if weights is not None:
        if isinstance(weights, np.ndarray):
            weights = torch.tensor(weights, dtype=torch.float32)
        weights = weights.to(images.device).view(-1)
        assert weights.shape == sample_losses.shape, \
            f"Shape mismatch: weights {weights.shape}, sample_losses {sample_losses.shape}"
        loss = (sample_losses * weights).mean()
    else:
        loss = criterion_L2(emb_Iadv_Tnat, Ori_emb_Inat_Tnat)

    return loss, output_Iadv_Tnat
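# Hypothetical usage note (the weighting scheme itself is left to the caller;
# uniform weights reduce to the unweighted FARE loss above):
#
#   weights = np.ones(images.shape[0], dtype=np.float32)
#   loss, logits = FT_FARE_loss_weighted(images, target, text_tokens, optimizer,
#                                        model, original_model, prompter,
#                                        add_prompter, prompt_learner, weights, args)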