# viscot-demo: llava/eval/eval_cot_score.py
import argparse
import json
import re
import time

import numpy as np
import requests
from tqdm import tqdm
BASE_PROMPT = """
You are responsible for proofreading the answers; you need to give a score to the model's answer by referring to the standard answer, based on the given question. The full score is 1 point and the minimum score is 0 points. Please output the score in the form "score: <score>". The evaluation criteria require that the closer the model's answer is to the standard answer, the higher the score.
"""
PROMPT = """
question: %s
standard answer: %s
model's answer: %s
"""
API_KEY = ''  # fill in your OpenAI API key before running
def make_request_openai(content, extra_args=None):
    """POST one chat-completion request to the judge model, retrying up to 3 times."""
    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + API_KEY,
    }
    retry_times = 3
    while retry_times > 0:
        try:
            data = {
                'model': 'gpt-3.5-turbo-1106',
                'messages': [
                    {'role': 'system', 'content': BASE_PROMPT},
                    {'role': 'user', 'content': content},
                ],
            }
            if extra_args:
                data.update(extra_args)
            r = requests.post('https://api.openai.com/v1/chat/completions',
                              headers=headers, json=data, timeout=60)
            response = r.json()
            return response['choices'][0]['message']['content']
        except Exception as e:
            print(e)
            time.sleep(1)
        finally:
            retry_times -= 1
    return 'unknown'  # sentinel: all retries failed
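# Illustrative call (requires a valid API_KEY; the reply text is an assumption
# about what the judge model typically returns):
#   make_request_openai(PROMPT % ("Q?", "gt", "pred"))  ->  "score: 0.5"
# On repeated failures the sentinel 'unknown' is returned, which get_score()
# below maps to 0.0 since it contains no "score".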
def get_score(question_text, gt_answer_text, pred_answer_text):
    """Ask the judge model to grade one prediction; returns a float in [0, 1]."""
    content = PROMPT % (question_text, gt_answer_text, pred_answer_text)
    ret = make_request_openai(content).lower()
    if 'score' not in ret:
        return 0.0
    res = re.findall(r'score: ([\d\.]+)', ret)
    if len(res) != 1:
        # Zero or multiple matches: treat the reply as unparsable.
        return 0.0
    # Clamp to [0, 1] in case the judge over- or under-shoots the range.
    return min(max(float(res[0]), 0.0), 1.0)
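# Examples of the parsing/clamping behaviour (judge replies are hypothetical):
#   "score: 1"    -> 1.0
#   "Score: 0.75" -> 0.75   (matching is done on the lower-cased reply)
#   "score: 1.5"  -> 1.0    (clamped to the valid range)
#   "I refuse."   -> 0.0    (no parsable score)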
def process(i):
    """Score the i-th prediction against its ground-truth conversation."""
    pred_answer = pred_answers[i]
    question_id = pred_answer['question_id']
    question = questions[question_id]
    # Strip the detection-style suffix so the judge only sees the question itself.
    if 'Please provide the bounding box' in question['conversations'][0]['value']:
        question_text = question['conversations'][0]['value'].split(' Please provide the bounding box')[0]
    else:
        question_text = question['conversations'][0]['value']
    gt_answer_text = question['conversations'][-1]['value']
    pred_answer_text = pred_answer['text']
    score = get_score(question_text, gt_answer_text, pred_answer_text)
    scores.append(score)
    results.append({
        'question_id': question_id,
        'question_text': question_text,
        'gt_answer_text': gt_answer_text,
        'pred_answer_text': pred_answer_text,
        'score': score,
    })
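# A minimal sketch of an optional threaded variant (not wired into __main__),
# assuming the API endpoint tolerates a few parallel requests. list.append is
# atomic in CPython, so process() can be mapped across a pool directly;
# max_workers=4 is an illustrative value, not tuned.
def process_concurrent(indices, max_workers=4):
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Drain the iterator with list() so worker exceptions are surfaced.
        list(tqdm(pool.map(process, indices), total=len(indices)))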
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--question-file", type=str)
    parser.add_argument("--result-file", type=str)
    parser.add_argument("--output-result", type=str)
    args = parser.parse_args()

    scores = []
    results = []

    # Questions may be stored as JSON Lines or as a single JSON array.
    with open(args.question_file) as f:
        if args.question_file.endswith('.jsonl'):
            questions = [json.loads(line) for line in f]
        else:
            questions = json.load(f)
    questions = {question['question_id']: question for question in questions}

    with open(args.result_file) as f:
        pred_answers = [json.loads(line) for line in f]

    for i in tqdm(range(len(pred_answers))):
        process(i)

    print('The avg score is: %f' % np.mean(scores))
    with open(args.output_result, 'w') as f:
        json.dump(results, f, indent=2)
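# Example invocation (file names are illustrative):
#   python llava/eval/eval_cot_score.py \
#       --question-file questions.jsonl \
#       --result-file model_answers.jsonl \
#       --output-result cot_scores.json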