# viscot-demo: llava/eval/eval_cot_score.py
import argparse
import json
import re
import time

import numpy as np
import requests
from tqdm import tqdm
BASE_PROMPT = """
You are responsible for proofreading the answers; you need to give a score to the model's answer by referring to the standard answer, based on the given question. The full score is 1 point and the minimum score is 0 points. Please output the score in the form "score: <score>". The evaluation criteria require that the closer the model's answer is to the standard answer, the higher the score.
"""
PROMPT = """
question: %s
standard answer: %s
model's answer: %s
"""
API_KEY = ''  # fill in your OpenAI API key before running
def make_request_openai(content, extra_args=None):
    """POST one chat-completion request to the judge model, retrying up to 3 times."""
    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + API_KEY,
    }
    retry_times = 3
    while retry_times > 0:
        try:
            data = {
                'model': 'gpt-3.5-turbo-1106',
                'messages': [
                    {'role': 'system', 'content': BASE_PROMPT},
                    {'role': 'user', 'content': content},
                ],
            }
            if extra_args:
                data.update(extra_args)
            r = requests.post('https://api.openai.com/v1/chat/completions',
                              headers=headers, json=data, timeout=60)
            response = r.json()
            return response['choices'][0]['message']['content']
        except Exception as e:
            print(e)
            time.sleep(1)
        finally:
            retry_times -= 1
    return 'unknown'  # sentinel: all retries failed
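# Illustrative call (requires a valid API_KEY; the reply text is an assumption
# about what the judge model typically returns):
#   make_request_openai(PROMPT % ("Q?", "gt", "pred"))  ->  "score: 0.5"
# On repeated failures the sentinel 'unknown' is returned, which get_score()
# below maps to 0.0 since it contains no "score".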
def get_score(question_text, gt_answer_text, pred_answer_text):
    """Ask the judge model to grade one prediction; returns a float in [0, 1]."""
    content = PROMPT % (question_text, gt_answer_text, pred_answer_text)
    ret = make_request_openai(content).lower()
    if 'score' not in ret:
        return 0.0
    res = re.findall(r'score: ([\d\.]+)', ret)
    if len(res) != 1:
        # Zero or multiple matches: treat the reply as unparsable.
        return 0.0
    # Clamp to [0, 1] in case the judge over- or under-shoots the range.
    return min(max(float(res[0]), 0.0), 1.0)
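# Examples of the parsing/clamping behaviour (judge replies are hypothetical):
#   "score: 1"    -> 1.0
#   "Score: 0.75" -> 0.75   (matching is done on the lower-cased reply)
#   "score: 1.5"  -> 1.0    (clamped to the valid range)
#   "I refuse."   -> 0.0    (no parsable score)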
def process(i):
    """Score the i-th prediction against its ground-truth conversation."""
    pred_answer = pred_answers[i]
    question_id = pred_answer['question_id']
    question = questions[question_id]
    # Strip the detection-style suffix so the judge only sees the question itself.
    if 'Please provide the bounding box' in question['conversations'][0]['value']:
        question_text = question['conversations'][0]['value'].split(' Please provide the bounding box')[0]
    else:
        question_text = question['conversations'][0]['value']
    gt_answer_text = question['conversations'][-1]['value']
    pred_answer_text = pred_answer['text']
    score = get_score(question_text, gt_answer_text, pred_answer_text)
    scores.append(score)
    results.append({
        'question_id': question_id,
        'question_text': question_text,
        'gt_answer_text': gt_answer_text,
        'pred_answer_text': pred_answer_text,
        'score': score,
    })
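# A minimal sketch of an optional threaded variant (not wired into __main__),
# assuming the API endpoint tolerates a few parallel requests. list.append is
# atomic in CPython, so process() can be mapped across a pool directly;
# max_workers=4 is an illustrative value, not tuned.
def process_concurrent(indices, max_workers=4):
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Drain the iterator with list() so worker exceptions are surfaced.
        list(tqdm(pool.map(process, indices), total=len(indices)))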
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--question-file", type=str)
    parser.add_argument("--result-file", type=str)
    parser.add_argument("--output-result", type=str)
    args = parser.parse_args()

    scores = []
    results = []

    # Questions may be stored as JSON Lines or as a single JSON array.
    with open(args.question_file) as f:
        if args.question_file.endswith('.jsonl'):
            questions = [json.loads(line) for line in f]
        else:
            questions = json.load(f)
    questions = {question['question_id']: question for question in questions}

    with open(args.result_file) as f:
        pred_answers = [json.loads(line) for line in f]

    for i in tqdm(range(len(pred_answers))):
        process(i)

    print('The avg score is: %f' % np.mean(scores))
    with open(args.output_result, 'w') as f:
        json.dump(results, f, indent=2)
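# Example invocation (file names are illustrative):
#   python llava/eval/eval_cot_score.py \
#       --question-file questions.jsonl \
#       --result-file model_answers.jsonl \
#       --output-result cot_scores.json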