Update evaluation code
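This commit adds six near-identical MMVP-VLM evaluation scripts, one per vision tower (MetaCLIP huge/large, OpenAI CLIP at 224/336 px, SigLIP at 224/384 px). Each script hard-codes its benchmark directory and checkpoint path even though it imports argparse without using it. As a minimal sketch (not part of the commit), those constants could be exposed as flags; the defaults below mirror the values used in the first script, and the flag names are illustrative:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="MMVP-VLM benchmark runner (hypothetical wrapper)")
    parser.add_argument("--benchmark-dir", default="/group/40033/public_datasets/MMVP_VLM",
                        help="Root directory containing 'Questions.csv' and 'MLLM_VLM Images'")
    parser.add_argument("--model-path", default="MetaCLIP_huge/metaclip-h14-fullcc2.5b-6000",
                        help="Local checkpoint directory passed to from_pretrained()")
    return parser.parse_args()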
evaluation/evaluate_mmvp_MetaCLIP_huge.py
ADDED
@@ -0,0 +1,157 @@
import os
import clip
from clip import load
import csv
from PIL import Image
import torch
from tqdm import tqdm
import json
from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
import argparse


def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    csv_outfile = open('Prediction_Results_MetaCLIP_huge', 'w', newline='')
    csv_writer = csv.writer(csv_outfile)
    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score'])  # header

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    num_pairs = 0

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip header
        for i, row in tqdm(enumerate(reader)):
            qid1, qtype1, statement1 = row

            # Get next row for the pair
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))

            text1 = 'a photo of ' + statement1
            text2 = 'a photo of ' + statement2

            #text1 = clip.tokenize([text1]).to(device)
            #text2 = clip.tokenize([text2]).to(device)
            text1 = tokenizer(
                text1,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)
            text2 = tokenizer(
                text2,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)  # torch.Size([1, 77])

            #img1 = preprocess(img1).unsqueeze(0).to(device)
            #img2 = preprocess(img2).unsqueeze(0).to(device)
            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)  # torch.Size([1, 3, 224, 224])
            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)  # torch.Size([1, 3, 224, 224])
            imgs = torch.cat((img1, img2), dim=0)  # torch.Size([2, 3, 224, 224])

            with torch.no_grad():
                model.eval().float()
                #logits_per_image1, logits_per_text1 = model(imgs, text1)
                #logits_per_image2, logits_per_text2 = model(imgs, text2)
                outputs1 = model(input_ids=text1, pixel_values=imgs)
                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
                outputs2 = model(input_ids=text2, pixel_values=imgs)
                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text

                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()

            img1_score1 = probs1[0][0]
            img1_score2 = probs2[0][0]

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])

            current_category = categories[num_pairs // 15]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            num_pairs += 1

    csv_outfile.close()

    # Calculate percentage accuracies
    Category_Score_List = []

    for category in pair_accuracies:
        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
        Category_Score_List.append(pair_accuracies[category])

    pair_accuracies['average_score'] = sum(Category_Score_List) / len(Category_Score_List)

    return pair_accuracies


def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):

    with torch.no_grad():
        clip_model.eval()

        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}

    # Merge results
    results = {**results_openai}

    # Convert results to format suitable for star plot
    categories = results[list(results.keys())[0]].keys()
    data = {'Categories': list(categories)}
    for model in list(results_openai.keys()):
        data[model] = [results[model][category] for category in categories]

    return results


if __name__ == "__main__":

    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_tower_name = 'MetaCLIP_huge/metaclip-h14-fullcc2.5b-6000'

    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
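For readers skimming the diff, the scoring rule in benchmark_model works as follows: each MMVP-VLM pair consists of two images and two statements; a statement is scored against both images at once, so logits_per_text has shape [1, 2], and a softmax over that row splits probability mass between img1 and img2. Ground truth comes from question-id parity (odd ids belong to img1), and a pair is credited only when both statements pick their correct image. A toy walk-through with made-up logits:

import torch

logits_per_text = torch.tensor([[21.3, 19.8]])  # one caption scored against [img1, img2]
probs = logits_per_text.softmax(dim=-1)         # tensor([[0.8176, 0.1824]])
img1_score = probs[0][0].item()
pred = "img1" if img1_score > 0.5 else "img2"   # -> "img1"

qid = 7                                         # odd question ids belong to img1
gt = "img1" if qid % 2 == 1 else "img2"
assert pred == gt                               # credited only if the paired caption also matches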
evaluation/evaluate_mmvp_MetaCLIP_large.py
ADDED
@@ -0,0 +1,157 @@
import os
import clip
from clip import load
import csv
from PIL import Image
import torch
from tqdm import tqdm
import json
from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
import argparse


def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    csv_outfile = open('Prediction_Results_MetaCLIP_large', 'w', newline='')
    csv_writer = csv.writer(csv_outfile)
    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score'])  # header

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    num_pairs = 0

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip header
        for i, row in tqdm(enumerate(reader)):
            qid1, qtype1, statement1 = row

            # Get next row for the pair
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))

            text1 = 'a photo of ' + statement1
            text2 = 'a photo of ' + statement2

            #text1 = clip.tokenize([text1]).to(device)
            #text2 = clip.tokenize([text2]).to(device)
            text1 = tokenizer(
                text1,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)
            text2 = tokenizer(
                text2,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)  # torch.Size([1, 77])

            #img1 = preprocess(img1).unsqueeze(0).to(device)
            #img2 = preprocess(img2).unsqueeze(0).to(device)
            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)  # torch.Size([1, 3, 224, 224])
            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)  # torch.Size([1, 3, 224, 224])
            imgs = torch.cat((img1, img2), dim=0)  # torch.Size([2, 3, 224, 224])

            with torch.no_grad():
                model.eval().float()
                #logits_per_image1, logits_per_text1 = model(imgs, text1)
                #logits_per_image2, logits_per_text2 = model(imgs, text2)
                outputs1 = model(input_ids=text1, pixel_values=imgs)
                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
                outputs2 = model(input_ids=text2, pixel_values=imgs)
                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text

                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()

            img1_score1 = probs1[0][0]
            img1_score2 = probs2[0][0]

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])

            current_category = categories[num_pairs // 15]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            num_pairs += 1

    csv_outfile.close()

    # Calculate percentage accuracies
    Category_Score_List = []

    for category in pair_accuracies:
        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
        Category_Score_List.append(pair_accuracies[category])

    pair_accuracies['average_score'] = sum(Category_Score_List) / len(Category_Score_List)

    return pair_accuracies


def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):

    with torch.no_grad():
        clip_model.eval()

        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}

    # Merge results
    results = {**results_openai}

    # Convert results to format suitable for star plot
    categories = results[list(results.keys())[0]].keys()
    data = {'Categories': list(categories)}
    for model in list(results_openai.keys()):
        data[model] = [results[model][category] for category in categories]

    return results


if __name__ == "__main__":

    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_tower_name = 'MetaCLIP_large/metaclip-l14-fullcc2.5b-7000'

    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
evaluation/evaluate_mmvp_OpenAICLIP_224.py
ADDED
@@ -0,0 +1,159 @@
import os
import clip
from clip import load
import csv
from PIL import Image
import torch
from tqdm import tqdm
import json
from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
import argparse


def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    csv_outfile = open('Prediction_Results_OpenAICLIP', 'w', newline='')
    csv_writer = csv.writer(csv_outfile)
    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score'])  # header

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    num_pairs = 0

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip header
        for i, row in tqdm(enumerate(reader)):
            qid1, qtype1, statement1 = row

            # Get next row for the pair
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))

            text1 = 'a photo of ' + statement1
            text2 = 'a photo of ' + statement2

            #text1 = clip.tokenize([text1]).to(device)
            #text2 = clip.tokenize([text2]).to(device)
            text1 = tokenizer(
                text1,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)
            text2 = tokenizer(
                text2,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)  # torch.Size([1, 77])

            #img1 = preprocess(img1).unsqueeze(0).to(device)
            #img2 = preprocess(img2).unsqueeze(0).to(device)
            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)  # torch.Size([1, 3, 224, 224])
            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)  # torch.Size([1, 3, 224, 224])
            imgs = torch.cat((img1, img2), dim=0)  # torch.Size([2, 3, 224, 224])

            with torch.no_grad():
                model.eval().float()
                #logits_per_image1, logits_per_text1 = model(imgs, text1)
                #logits_per_image2, logits_per_text2 = model(imgs, text2)
                outputs1 = model(input_ids=text1, pixel_values=imgs)
                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
                outputs2 = model(input_ids=text2, pixel_values=imgs)
                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text

                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()

            img1_score1 = probs1[0][0]
            img1_score2 = probs2[0][0]

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])

            current_category = categories[num_pairs // 15]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            num_pairs += 1

    csv_outfile.close()

    # Calculate percentage accuracies
    Category_Score_List = []

    for category in pair_accuracies:
        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
        Category_Score_List.append(pair_accuracies[category])

    pair_accuracies['average_score'] = sum(Category_Score_List) / len(Category_Score_List)

    return pair_accuracies


def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):

    with torch.no_grad():
        clip_model.eval()

        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}

    # Merge results
    results = {**results_openai}

    # Convert results to format suitable for star plot
    categories = results[list(results.keys())[0]].keys()
    data = {'Categories': list(categories)}
    for model in list(results_openai.keys()):
        data[model] = [results[model][category] for category in categories]

    return results


if __name__ == "__main__":

    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_tower_name = 'OpenAICLIP_224/clip-vit-large-patch14-all-lr5-3000-res384'

    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

    #vision_tower.to(torch.float32)
    # print(next(model.parameters()).device) # cuda:0

    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
evaluation/evaluate_mmvp_OpenAICLIP_336.py
ADDED
@@ -0,0 +1,159 @@
import os
import clip
from clip import load
import csv
from PIL import Image
import torch
from tqdm import tqdm
import json
from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
import argparse


def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    # NB: same output filename as the 224-px script, so running both overwrites the earlier results
    csv_outfile = open('Prediction_Results_OpenAICLIP', 'w', newline='')
    csv_writer = csv.writer(csv_outfile)
    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score'])  # header

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    num_pairs = 0

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip header
        for i, row in tqdm(enumerate(reader)):
            qid1, qtype1, statement1 = row

            # Get next row for the pair
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))

            text1 = 'a photo of ' + statement1
            text2 = 'a photo of ' + statement2

            #text1 = clip.tokenize([text1]).to(device)
            #text2 = clip.tokenize([text2]).to(device)
            text1 = tokenizer(
                text1,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)
            text2 = tokenizer(
                text2,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)  # torch.Size([1, 77])

            #img1 = preprocess(img1).unsqueeze(0).to(device)
            #img2 = preprocess(img2).unsqueeze(0).to(device)
            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)  # torch.Size([1, 3, 336, 336]) for the 336-px checkpoint
            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)  # torch.Size([1, 3, 336, 336])
            imgs = torch.cat((img1, img2), dim=0)  # torch.Size([2, 3, 336, 336])

            with torch.no_grad():
                model.eval().float()
                #logits_per_image1, logits_per_text1 = model(imgs, text1)
                #logits_per_image2, logits_per_text2 = model(imgs, text2)
                outputs1 = model(input_ids=text1, pixel_values=imgs)
                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
                outputs2 = model(input_ids=text2, pixel_values=imgs)
                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text

                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()

            img1_score1 = probs1[0][0]
            img1_score2 = probs2[0][0]

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])

            current_category = categories[num_pairs // 15]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            num_pairs += 1

    csv_outfile.close()

    # Calculate percentage accuracies
    Category_Score_List = []

    for category in pair_accuracies:
        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
        Category_Score_List.append(pair_accuracies[category])

    pair_accuracies['average_score'] = sum(Category_Score_List) / len(Category_Score_List)

    return pair_accuracies


def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):

    with torch.no_grad():
        clip_model.eval()

        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}

    # Merge results
    results = {**results_openai}

    # Convert results to format suitable for star plot
    categories = results[list(results.keys())[0]].keys()
    data = {'Categories': list(categories)}
    for model in list(results_openai.keys()):
        data[model] = [results[model][category] for category in categories]

    return results


if __name__ == "__main__":

    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_tower_name = 'OpenAICLIP_336/clip-vit-large-patch14-336-all-lr5-3500-512-tokens'

    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

    #vision_tower.to(torch.float32)
    # print(next(model.parameters()).device) # cuda:0

    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
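One detail shared by all six scripts: model.eval().float() is re-executed inside the CSV loop for every pair, and the commented-out vision_tower.to(torch.float32) above suggests the cast was also tried at load time. Since both calls are idempotent, they can be hoisted out of the loop. A sketch of that refactor (a suggestion, not what the commit does):

import torch
from transformers import CLIPModel

def prepare_model(model: CLIPModel, device: torch.device) -> CLIPModel:
    # eval() and float() are idempotent, so calling them once before the
    # benchmark loop is equivalent to repeating them per pair and skips
    # a full-parameter dtype pass on every iteration.
    return model.eval().float().to(device)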
evaluation/evaluate_mmvp_SigLIP_224.py
ADDED
@@ -0,0 +1,152 @@
import os
import clip
from clip import load
import csv
from PIL import Image
import torch
from tqdm import tqdm
import json
from transformers import SiglipProcessor, SiglipModel, SiglipImageProcessor, SiglipTokenizer
import argparse


def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    csv_outfile = open('Prediction_Results_SigLIP_224', 'w', newline='')
    csv_writer = csv.writer(csv_outfile)
    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score'])  # header

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    num_pairs = 0

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip header
        for i, row in tqdm(enumerate(reader)):
            qid1, qtype1, statement1 = row

            # Get next row for the pair
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))

            text1 = 'a photo of ' + statement1
            text2 = 'a photo of ' + statement2

            #text1 = clip.tokenize([text1]).to(device)
            #text2 = clip.tokenize([text2]).to(device)
            text1 = tokenizer(
                text1,
                truncation=True,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)
            text2 = tokenizer(
                text2,
                truncation=True,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)  # torch.Size([1, 64]); SigLIP pads to its 64-token context, not CLIP's 77

            #img1 = preprocess(img1).unsqueeze(0).to(device)
            #img2 = preprocess(img2).unsqueeze(0).to(device)
            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)  # torch.Size([1, 3, 224, 224])
            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)  # torch.Size([1, 3, 224, 224])
            imgs = torch.cat((img1, img2), dim=0)  # torch.Size([2, 3, 224, 224])

            with torch.no_grad():
                model.eval().float()
                #logits_per_image1, logits_per_text1 = model(imgs, text1)
                #logits_per_image2, logits_per_text2 = model(imgs, text2)
                outputs1 = model(input_ids=text1, pixel_values=imgs)
                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
                outputs2 = model(input_ids=text2, pixel_values=imgs)
                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text

                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()

            img1_score1 = probs1[0][0]
            img1_score2 = probs2[0][0]

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])

            current_category = categories[num_pairs // 15]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            num_pairs += 1

    csv_outfile.close()

    # Calculate percentage accuracies
    Category_Score_List = []

    for category in pair_accuracies:
        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
        Category_Score_List.append(pair_accuracies[category])

    pair_accuracies['average_score'] = sum(Category_Score_List) / len(Category_Score_List)

    return pair_accuracies


def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):

    with torch.no_grad():
        clip_model.eval()

        results_siglip = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}

    # Merge results
    results = {**results_siglip}

    # Convert results to format suitable for star plot
    categories = results[list(results.keys())[0]].keys()
    data = {'Categories': list(categories)}
    for model in list(results_siglip.keys()):
        data[model] = [results[model][category] for category in categories]

    return results


if __name__ == "__main__":

    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_tower_name = 'SigLIP_224/siglip-so400m-patch14-224-9000'

    vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
    image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
    tokenizer = SiglipTokenizer.from_pretrained(vision_tower_name)

    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
evaluation/evaluate_mmvp_SigLIP_384.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import clip
|
| 3 |
+
from clip import load
|
| 4 |
+
import csv
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import torch
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
import json
|
| 9 |
+
from transformers import SiglipProcessor, SiglipModel, SiglipImageProcessor, SiglipTokenizer
|
| 10 |
+
import argparse
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
|
| 15 |
+
|
| 16 |
+
image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
|
| 17 |
+
csv_file = os.path.join(benchmark_dir, 'Questions.csv')
|
| 18 |
+
|
| 19 |
+
csv_outfile = open('Prediction_Results_SigLIP_384', 'w', newline='')
|
| 20 |
+
csv_writer = csv.writer(csv_outfile)
|
| 21 |
+
csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header
|
| 22 |
+
|
| 23 |
+
categories = [
|
| 24 |
+
'Orientation and Direction', 'Presence of Specific Features',
|
| 25 |
+
'State and Condition', 'Quantity and Count',
|
| 26 |
+
'Positional and Relational Context', 'Color and Appearance',
|
| 27 |
+
'Structural Characteristics', 'Texts',
|
| 28 |
+
'Viewpoint and Perspective'
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
pair_accuracies = {category: 0 for category in categories}
|
| 32 |
+
num_pairs = 0
|
| 33 |
+
|
| 34 |
+
with open(csv_file, 'r') as f:
|
| 35 |
+
reader = csv.reader(f)
|
| 36 |
+
next(reader) # skip header
|
| 37 |
+
for i, row in tqdm(enumerate(reader)):
|
| 38 |
+
qid1, qtype1, statement1 = row
|
| 39 |
+
|
| 40 |
+
# Get next row for the pair
|
| 41 |
+
row = next(reader, None)
|
| 42 |
+
if not row:
|
| 43 |
+
break
|
| 44 |
+
qid2, qtype2, statement2 = row
|
| 45 |
+
|
| 46 |
+
qid1, qid2 = int(qid1), int(qid2)
|
| 47 |
+
|
| 48 |
+
img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
|
| 49 |
+
img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))
|
| 50 |
+
|
| 51 |
+
text1 = 'a photo of ' + statement1
|
| 52 |
+
text2 = 'a photo of ' + statement2
|
| 53 |
+
|
| 54 |
+
#text1 = clip.tokenize([text1]).to(device)
|
| 55 |
+
#text2 = clip.tokenize([text2]).to(device)
|
| 56 |
+
text1 = tokenizer(
|
| 57 |
+
text1,
|
| 58 |
+
truncation=True,
|
| 59 |
+
return_length=False,
|
| 60 |
+
return_overflowing_tokens=False,
|
| 61 |
+
padding="max_length",
|
| 62 |
+
return_tensors="pt",
|
| 63 |
+
)["input_ids"].to(device)
|
| 64 |
+
text2 = tokenizer(
|
| 65 |
+
text2,
|
| 66 |
+
truncation=True,
|
| 67 |
+
return_length=False,
|
| 68 |
+
return_overflowing_tokens=False,
|
| 69 |
+
padding="max_length",
|
| 70 |
+
return_tensors="pt",
|
| 71 |
+
)["input_ids"].to(device) # torch.Size([1, 77])
|
| 72 |
+
|
| 73 |
+
#img1 = preprocess(img1).unsqueeze(0).to(device)
|
| 74 |
+
#img2 = preprocess(img2).unsqueeze(0).to(device)
|
| 75 |
+
img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
|
| 76 |
+
img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
|
| 77 |
+
imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
|
| 78 |
+
|
| 79 |
+
with torch.no_grad():
|
| 80 |
+
model.eval().float()
|
| 81 |
+
#logits_per_image1, logits_per_text1 = model(imgs, text1)
|
| 82 |
+
#logits_per_image2, logits_per_text2 = model(imgs, text2)
|
| 83 |
+
outputs1 = model(input_ids=text1, pixel_values=imgs)
|
| 84 |
+
logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
|
| 85 |
+
outputs2 = model(input_ids=text2, pixel_values=imgs)
|
| 86 |
+
logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text
|
| 87 |
+
|
| 88 |
+
probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
|
| 89 |
+
probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()
|
| 90 |
+
|
| 91 |
+
img1_score1 = probs1[0][0]
|
| 92 |
+
img1_score2 = probs2[0][0]
|
| 93 |
+
|
| 94 |
+
pred1 = "img1" if img1_score1 > 0.5 else "img2"
|
| 95 |
+
pred2 = "img1" if img1_score2 > 0.5 else "img2"
|
| 96 |
+
|
| 97 |
+
gt1 = "img1" if qid1 % 2 == 1 else "img2"
|
| 98 |
+
gt2 = "img1" if qid2 % 2 == 1 else "img2"
|
| 99 |
+
|
| 100 |
+
csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])
|
| 101 |
+
|
| 102 |
+
current_category = categories[num_pairs // 15]
|
| 103 |
+
if pred1 == gt1 and pred2 == gt2:
|
| 104 |
+
pair_accuracies[current_category] += 1
|
| 105 |
+
num_pairs += 1
|
| 106 |
+
|
| 107 |
+
csv_outfile.close()
|
| 108 |
+
|
| 109 |
+
# Calculate percentage accuracies
|
| 110 |
+
Category_Score_List = []
|
| 111 |
+
|
| 112 |
+
for category in pair_accuracies:
|
| 113 |
+
pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
|
| 114 |
+
Category_Score_List.append(pair_accuracies[category])
|
| 115 |
+
|
| 116 |
+
pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)
|
| 117 |
+
|
| 118 |
+
return pair_accuracies
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):
|
| 122 |
+
|
| 123 |
+
with torch.no_grad():
|
| 124 |
+
clip_model.eval()
|
| 125 |
+
|
| 126 |
+
results_siglip = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}
|
| 127 |
+
|
| 128 |
+
# Merge results
|
| 129 |
+
results = {**results_siglip}
|
| 130 |
+
|
| 131 |
+
# Convert results to format suitable for star plot
|
| 132 |
+
categories = results[list(results.keys())[0]].keys()
|
| 133 |
+
data = {'Categories': list(categories)}
|
| 134 |
+
for model in list(results_siglip.keys()):
|
| 135 |
+
data[model] = [results[model][category] for category in categories]
|
| 136 |
+
|
| 137 |
+
return results
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
if __name__ == "__main__":
|
| 141 |
+
|
| 142 |
+
BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
|
| 143 |
+
|
| 144 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 145 |
+
vision_tower_name = f'SigLIP_384/siglip-so400m-patch14-384-7500'
|
| 146 |
+
|
| 147 |
+
vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
|
| 148 |
+
image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
|
| 149 |
+
tokenizer = SiglipTokenizer.from_pretrained(vision_tower_name)
|
| 150 |
+
|
| 151 |
+
results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
|
| 152 |
+
print(results)
|
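The two SigLIP scripts reuse the CLIP-style softmax scoring unchanged. SigLIP itself is trained with a pairwise sigmoid loss rather than a softmax contrastive loss, so its logits also admit an independent per-pair reading; the softmax over the two candidate images still yields a valid relative score, but a sigmoid reading is a natural cross-check. A sketch of that alternative (illustrative logits, not what the scripts above compute):

import torch

logits_per_text = torch.tensor([[-3.1, -7.4]])  # SigLIP logits for one caption vs [img1, img2]
match_probs = torch.sigmoid(logits_per_text)    # independent P(caption matches image)
pred = "img1" if match_probs[0][0] > match_probs[0][1] else "img2"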