import argparse
import json
import os

from tqdm import tqdm
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams


# Default directory holding the filtered-index file, the two answer files and
# the judgement output written by this script.
DEFAULT_DATA_DIR = "/home/aiscuser/fhw/data"

# Used both as the vLLM context window (max_model_len) and as the generation
# budget (max_tokens), exactly as in the original script.
# NOTE(review): prompt tokens + max_tokens can exceed the context window —
# confirm how the installed vLLM version handles that.
MAX_CONTEXT_TOKENS = 5400

SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user prompt displayed below."


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with ``judge``, ``model``, ``data_dir`` and
        ``tensor_parallel_size``.
    """
    parser = argparse.ArgumentParser()
    # Both help strings read "model path"; kept verbatim.
    # required=True replaces the AttributeError that a missing flag used to
    # trigger later (``None.rfind``) with a normal argparse usage error.
    parser.add_argument('--judge', type=str, required=True, help='模型路径')
    parser.add_argument('--model', type=str, required=True, help='模型路径')
    parser.add_argument('--data_dir', type=str, default=DEFAULT_DATA_DIR,
                        help='directory containing the answer files')
    parser.add_argument('--tensor_parallel_size', type=int, default=8,
                        help='number of GPUs used by vLLM')
    return parser.parse_args(argv)


def model_basename(path):
    """Return the last path component of *path*, ignoring trailing slashes.

    The previous slicing on ``rfind('/')`` produced an empty name for inputs
    such as ``/models/judge/``; normalising first avoids that.
    """
    return os.path.basename(os.path.normpath(path))


def shared_indexes(kept, judgename, modelname):
    """Return the sorted indexes listed under both names in *kept*.

    Sorting makes the order of prompts (and of the output lines) reproducible
    instead of depending on set iteration order.
    """
    return sorted(set(kept[judgename]).intersection(kept[modelname]))


def build_user_prompt(instruction, reference, response):
    """Build the judge's user message for one instruction/answer pair."""
    return (
        "You will be given a user prompt, a reference answer and the assistant's answer. Your job is to compare the assistant's answer with the reference one and assign a score. For each user prompt, carry out the following steps:\n"
        "1. Consider if the assistant's answer is helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n"
        "2. Then consider the creativity and novelty of the assistant's answer when needed.\n"
        "3. Finally, identify any missing important information in the assistants'answer that would be beneficial to include when responding to the user prompt.\n"
        "4. After providing your explanation, you must rate the assistant's answer on a scale of 1 to 10, where a higher score reflects higher quality.\n"
        "Guidelines for Scoring:\n"
        "• Assistant's Answer>>Reference Answer (7-10): The assistant's answer is significantly or slightly better than the reference answer.\n"
        "• Assistant's Answer==Reference Answer (5-6): The quality of assistant's answer is relatively the same as that of the reference answer.\n"
        "• Assistant's Answer<<Reference Answer (1-4): The assistant's answer is significantly or slightly worse than the reference answer.\n\n"
        f"User Prompt:\n{instruction}\n\n"
        f"Reference Answer:\n{reference}\n\n"
        f"Assistant's Answer:\n{response}\n\n"
        "Use double square brackets to format your scores, like so: [[7]].\n"
    )


def read_lines(path):
    """Return all lines of the text file at *path*, closing it afterwards."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.readlines()


def main(argv=None):
    """Have the judge model compare a model's answers against its own.

    Reads the indexes kept for both models, builds one chat prompt per shared
    index (the judge's own answer is the reference), generates a judgement
    with vLLM and writes one JSON line per index, adding ``battle`` (the
    judge's raw text) and ``index`` to the evaluated model's record.
    """
    args = parse_args(argv)
    judgename = model_basename(args.judge)
    modelname = model_basename(args.model)
    data_dir = args.data_dir

    # The first line of the filter file is a JSON object mapping a model name
    # to the list of line indexes kept for that model.
    with open(f"{data_dir}/{judgename}_filtered_by_answer.json", "r", encoding="utf-8") as f:
        kept = json.loads(f.readline())
    indexes = shared_indexes(kept, judgename, modelname)
    print(len(kept[judgename]))
    print(len(kept[modelname]))
    print(len(indexes))

    reference_lines = read_lines(f"{data_dir}/{judgename}_answerby_{judgename}.json")
    candidate_lines = read_lines(f"{data_dir}/{judgename}_answerby_{modelname}.json")

    tokenizer = AutoTokenizer.from_pretrained(args.judge, trust_remote_code=True)

    prompts = []
    records = []
    for index in tqdm(indexes):
        reference_record = json.loads(reference_lines[index])
        candidate_record = json.loads(candidate_lines[index])
        user_prompt = build_user_prompt(
            reference_record["instruction"],
            reference_record["response"],
            candidate_record["response"],
        )
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ]
        # add_generation_prompt=True appends the assistant-turn header so the
        # judge starts answering instead of continuing the user turn.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        prompts.append(text)
        records.append(candidate_record)

    if prompts:
        # The model is loaded only after the data has been read and validated,
        # so input errors surface before the expensive multi-GPU start-up.
        llm = LLM(
            args.judge,
            dtype='float16',
            tensor_parallel_size=args.tensor_parallel_size,
            trust_remote_code=True,
            enforce_eager=True,
            max_model_len=MAX_CONTEXT_TOKENS,
        )
        sampling_params = SamplingParams(temperature=1.0, top_p=0.95, max_tokens=MAX_CONTEXT_TOKENS)
        outputs = llm.generate(prompts=prompts, sampling_params=sampling_params)
    else:
        outputs = []

    # The output file is opened only once results exist, so a failed run does
    # not truncate the judgements of a previous one.
    with open(f"{data_dir}/{judgename}_judge_{modelname}.json", "w", encoding="utf-8") as fw:
        for output, index, record in zip(outputs, indexes, records):
            record["battle"] = output.outputs[0].text
            record["index"] = index
            fw.write(json.dumps(record) + "\n")


if __name__ == "__main__":
    main()