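"""Score a model's answers against ground-truth answers with a GPT grader.

Each predicted answer is paired with its question and reference answer,
sent to gpt-3.5-turbo-1106 with a grading prompt, and assigned a score in
[0, 1] parsed from the grader's reply. Per-question results are written to
a JSON file and the average score is printed.
"""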
import argparse
import json
import re
import time

import numpy as np
import requests
from tqdm import tqdm
BASE_PROMPT = """
You are responsible for proofreading the answers. You need to give a score to the model's answer by referring to the standard answer, based on the given question. The full score is 1 point and the minimum score is 0 points. Please output the score in the form "score: <score>". The evaluation criteria require that the closer the model's answer is to the standard answer, the higher the score.
"""
PROMPT = """
question: %s
standard answer: %s
model's answer: %s
"""
API_KEY = ''
def make_request_openai(content, extra_args=None):
    # Query the grading model over the chat completions API, retrying up to
    # three times; returns the string 'unknown' on persistent failure.
    headers = {'Content-Type': 'application/json'}
    retry_times = 3
    while retry_times > 0:
        try:
            data = {
                'model': 'gpt-3.5-turbo-1106',
                'messages': [
                    {'role': 'system', 'content': BASE_PROMPT},
                    {'role': 'user', 'content': content},
                ],
            }
            if extra_args:
                data.update(extra_args)
            # The API expects a Bearer token in the Authorization header.
            headers['Authorization'] = 'Bearer ' + API_KEY
            r = requests.post('https://api.openai.com/v1/chat/completions',
                              headers=headers, json=data, timeout=60)
            response = r.json()
            return response['choices'][0]['message']['content']
        except Exception as e:
            print(e)
            time.sleep(1)
        finally:
            retry_times -= 1
    return 'unknown'
def get_score(question_text, gt_answer_text, pred_answer_text):
    # Ask the grader to compare the prediction against the ground truth,
    # then parse a single "score: <float>" from its reply; anything
    # unparseable falls back to 0.0.
    content = PROMPT % (question_text, gt_answer_text, pred_answer_text)
    ret = make_request_openai(content).lower()
    if 'score' not in ret:
        return 0.0
    res = re.findall(r'score:\s*(\d+(?:\.\d+)?)', ret)
    if len(res) != 1:
        return 0.0
    # Clamp to the valid [0, 1] range.
    return min(max(float(res[0]), 0.0), 1.0)
def process(i):
    # Score the i-th prediction. Relies on the module-level `pred_answers`,
    # `questions`, `scores`, and `results` set up in the __main__ block below.
    pred_answer = pred_answers[i]
    question_id = pred_answer['question_id']
    question = questions[question_id]
    # Strip the bounding-box instruction from the question, if present.
    if 'Please provide the bounding box' in question['conversations'][0]['value']:
        question_text = question['conversations'][0]['value'].split(' Please provide the bounding box')[0]
    else:
        question_text = question['conversations'][0]['value']
    gt_answer_text = question['conversations'][-1]['value']
    pred_answer_text = pred_answer['text']
    score = get_score(question_text, gt_answer_text, pred_answer_text)
    scores.append(score)
    results.append({'question_id': question_id,
                    'question_text': question_text,
                    'gt_answer_text': gt_answer_text,
                    'pred_answer_text': pred_answer_text,
                    'score': score})
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--question-file", type=str)
parser.add_argument("--result-file", type=str)
parser.add_argument('--output-result', type=str)
args = parser.parse_args()
scores = []
results = []
if args.question_file.endswith('.jsonl'):
questions = [json.loads(line) for line in open(args.question_file)]
questions = {question['question_id']: question for question in questions}
else:
questions = json.load(open(args.question_file))
questions = {question['question_id']: question for question in questions}
pred_answers = [json.loads(q) for q in open(args.result_file)]
last_num = 0
for i in tqdm(range(len(pred_answers))):
process(i)
print('The avg score is: %f' % np.mean(scores))
with open(args.output_result, 'w') as f:
json.dump(results, f, indent=2)
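
# Example invocation (script and file names are illustrative):
#   python eval_gpt_score.py \
#       --question-file questions.jsonl \
#       --result-file answers.jsonl \
#       --output-result scored_results.json
#
# Expected record layouts, inferred from the field accesses above:
#   question file: {"question_id": ..., "conversations": [{"value": "<question>"}, ..., {"value": "<gt answer>"}]}
#   result file:   one JSON object per line, {"question_id": ..., "text": "<model answer>"}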