Spaces:
Sleeping
Sleeping
| from time import process_time_ns | |
| import torch | |
| import clip | |
| from PIL import Image | |
| import os | |
| import json | |
| import numpy as np | |
| from pathlib import Path | |
| import matplotlib.pyplot as plt | |
| from sklearn.metrics import confusion_matrix | |
| import itertools | |
| import torch.nn.functional as F | |
| import torch.nn as nn | |
| from torch.utils.data import random_split | |
| from torch import nn | |
| from torchvision import transforms | |
| import sys | |
| import argparse | |
| import time | |
| from tqdm import tqdm | |
| from sklearn import metrics | |
| from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_curve | |
| from blipmodels import blip_decoder | |
class NeuralNet(nn.Module):
    """Two-hidden-layer MLP classifier.

    Architecture: Linear -> ReLU -> Dropout(0.5) -> Linear -> ReLU -> Linear.
    Attribute names (fc1/fc2/fc3/dropout2) are part of the checkpoint
    state_dict contract and must not be renamed.
    """

    def __init__(self, input_size, hidden_size_list, num_classes):
        super(NeuralNet, self).__init__()
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(input_size, hidden_size_list[0])
        self.fc2 = nn.Linear(hidden_size_list[0], hidden_size_list[1])
        self.fc3 = nn.Linear(hidden_size_list[1], num_classes)

    def forward(self, x):
        # Dropout is applied only after the first hidden activation,
        # matching the trained checkpoint's behavior.
        hidden = self.dropout2(F.relu(self.fc1(x)))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
def preprocess_image(img_path, image_size=224):
    """Load the image at *img_path*, resize it to a square of side
    *image_size*, and run it through the module-level CLIP ``preprocess``
    transform, returning the resulting tensor.

    NOTE: relies on the global ``preprocess`` created by ``clip.load``.
    """
    loaded = Image.open(img_path)
    resized = loaded.resize((image_size, image_size))
    return preprocess(resized)
# --- CLI ---------------------------------------------------------------
parser = argparse.ArgumentParser(description='Finetune the classifier to wash the backdoor')
parser.add_argument('--image_path', default='CLIP.png', type=str)
parser.add_argument('--gpu', default='0', type=str)  # NOTE(review): currently unused
args = parser.parse_args()

device = "cuda" if torch.cuda.is_available() else "cpu"

# CLIP provides the encoders plus the matching image preprocess transform
# (the global used by preprocess_image).
model2, preprocess = clip.load("ViT-B/32")

# --- BLIP captioner: produce a text caption for the input image --------
image_size = 224
blip_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth'
blip = blip_decoder(pretrained=blip_url, image_size=image_size, vit='base')
blip.eval()
blip = blip.to(device)

img = Image.open(args.image_path).convert('RGB')
tform = transforms.Compose(
    [
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ]
)
img = tform(img)
# BUG FIX: was .to("cuda"), which crashed on CPU-only machines even though
# `device` had already been computed above.
img = img.unsqueeze(0).to(device)
caption = blip.generate(img, sample=False, num_beams=3, max_length=60, min_length=5)
text = clip.tokenize(list(caption)).to(device)

# --- fine-tuned CLIP + linear head (whole pickled modules) -------------
# NOTE(review): torch.load on a pickled model executes arbitrary code when
# unpickling; only load checkpoints from trusted sources.
model = torch.load("finetune_clip.pt").to(device)
# The checkpoint is a complete NeuralNet instance, so no fresh head needs
# to be constructed first (the original built one and immediately discarded
# it); move the loaded head to the active device explicitly.
linear = torch.load('clip_linear.pt').to(device)

image = preprocess_image(args.image_path, image_size).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # Concatenate image and text embeddings along the feature axis;
    # presumably 512 + 512 = 1024, the head's input width — confirm
    # against the saved checkpoint.
    emb = torch.cat((image_features, text_features), 1)
    output = linear(emb.float())
    predict = output.argmax(1)
predict = predict.cpu().numpy()
print(predict)