from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import json
from tqdm import tqdm
import re
from math_verify import parse, verify
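# math_verify (HuggingFace Math-Verify) is used below to parse the model's
# answer string and check mathematical equivalence against the ground truth.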
MODEL_PATH = "<MODEL_PATH>"  # Qwen2-VL base model, or a GRPO-trained checkpoint on the GeoQA train split
BSZ = 50  # reduce this if you run into GPU OOM
OUTPUT_PATH = "<OUTPUT_LOG>"
PROMPT_PATH = "./prompts/geoqa_test_prompts.jsonl"
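# Expected schema of PROMPT_PATH (one JSON object per line), inferred from the
# fields accessed below:
#   {"image_path": "...", "question": "...", "ground_truth": "..."}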
# We recommend enabling flash_attention_2 for better acceleration and memory saving,
# especially in multi-image and video scenarios.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained(MODEL_PATH)
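# Note (assumption): decoder-only models generally expect left padding for
# batched generation; if batched outputs look degraded, try enabling:
#   processor.tokenizer.padding_side = "left"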
data = []
with open(PROMPT_PATH, "r") as f:
    for line in f:
        data.append(json.loads(line))
QUESTION_TEMPLATE = "{Question} Output the thinking process in <think> </think> and final answer (number) in <answer> </answer> tags."
messages = []
for i in data:
    message = [{
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": f"file://{i['image_path']}"
            },
            {
                "type": "text",
                "text": QUESTION_TEMPLATE.format(Question=i['question'])
            }
        ]
    }]
    messages.append(message)
all_outputs = []  # list to store all answers

# Process the prompts in batches
for i in tqdm(range(0, len(messages), BSZ)):
    batch_messages = messages[i:i + BSZ]

    # Preparation for inference
    text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
    image_inputs, video_inputs = process_vision_info(batch_messages)
    inputs = processor(
        text=text,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Inference: greedy decoding (do_sample=False) of up to 1024 new tokens
    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=1024, do_sample=False)
    # Trim the prompt tokens so only the generated continuation is decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    batch_output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    all_outputs.extend(batch_output_text)

    print(f"Processed batch {i//BSZ + 1}/{(len(messages) + BSZ - 1)//BSZ}")
final_output = []
correct_number = 0

for input_example, model_output in zip(data, all_outputs):
    original_output = model_output
    ground_truth = input_example['ground_truth']
    # math_verify.parse returns a list of parsed candidates (empty if nothing could be extracted)
    model_answer = parse(original_output)

    # Count correct answers; the truthiness check also guards against an empty parse result
    if model_answer and verify(model_answer, parse(ground_truth)):
        correct_number += 1
        is_correct = True
    else:
        is_correct = False

    result = {
        'question': input_example,
        'ground_truth': ground_truth,
        'model_output': original_output,
        'extracted_answer': str(model_answer[0]) if model_answer else None,
        'is_correct': is_correct
    }
    final_output.append(result)
# Calculate and print accuracy
accuracy = correct_number / len(data) * 100
print(f"\nAccuracy: {accuracy:.2f}%")
# Save results to a JSON file
output_path = OUTPUT_PATH
with open(output_path, "w") as f:
    json.dump({
        'accuracy': accuracy,
        'results': final_output
    }, f, indent=2, ensure_ascii=False)
print(f"Results saved to {output_path}")