codescripts / vllmbattle.py

f541119578

Upload folder using huggingface_hub

fdf190d verified about 1 year ago

3.74 kB

	from transformers import AutoTokenizer
	from vllm import LLM, SamplingParams
	import argparse
	import json
	from tqdm import tqdm

	# Directory holding the filtered-index file, both answer files and the output file.
	DEFAULT_DATA_DIR = "/home/aiscuser/fhw/data"

	# System message sent to the judge model with every comparison.
	SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user prompt displayed below."


	def model_name_from_path(path: str) -> str:
	    """Return the last component of a model path or hub id.

	    Trailing slashes are ignored so that ``/models/foo/`` yields ``foo``
	    instead of an empty string (which would produce wrong data file names).
	    """
	    return path.rstrip('/').rsplit('/', 1)[-1]


	def shared_indexes(filtered: dict, judgename: str, modelname: str) -> list:
	    """Return, in ascending order, the indexes listed for both models.

	    ``filtered`` maps a model name to a collection of indexes. Sorting makes
	    the order of the generated prompts and of the output file reproducible.

	    Raises:
	        KeyError: if either model name is missing from ``filtered``.
	    """
	    return sorted(set(filtered[judgename]) & set(filtered[modelname]))


	def build_user_prompt(instruction: str, reference: str, response: str) -> str:
	    """Build the user message asking the judge to score ``response`` against ``reference``."""
	    # The wording below is the exact prompt text sent to the judge; it is
	    # only split across adjacent literals for readability.
	    return (
	        "You will be given a user prompt, a reference answer and the assistant's answer. "
	        "Your job is to compare the assistant's answer with the reference one and assign a score. "
	        "For each user prompt, carry out the following steps:\n"
	        "1. Consider if the assistant's answer is helpful, relevant, and concise. "
	        "Helpful means the answer correctly responds to the prompt or follows the instructions. "
	        "Relevant means all parts of the response closely connect or are appropriate to what is being asked. "
	        "Concise means the response is clear and not verbose or excessive.\n"
	        "2. Then consider the creativity and novelty of the assistant's answer when needed.\n"
	        "3. Finally, identify any missing important information in the assistants'answer that would be beneficial to include when responding to the user prompt.\n"
	        "4. After providing your explanation, you must rate the assistant's answer on a scale of 1 to 10, where a higher score reflects higher quality.\n"
	        "Guidelines for Scoring:\n"
	        "• Assistant's Answer>>Reference Answer (7-10): The assistant's answer is significantly or slightly better than the reference answer.\n"
	        "• Assistant's Answer==Reference Answer (5-6): The quality of assistant's answer is relatively the same as that of the reference answer.\n"
	        "• Assistant's Answer<<Reference Answer (1-4): The assistant's answer is significantly or slightly worse than the reference answer.\n"
	        "\nUser Prompt:\n"
	        f"{instruction}\n\nReference Answer:\n"
	        f"{reference}\n\nAssistant's Answer:\n"
	        f"{response}\n\nUse double square brackets to format your scores, like so: [[7]].\n"
	    )


	def main() -> None:
	    """Have the judge model score another model's answers against its own.

	    Reads, from the data directory:
	      * ``{judge}_filtered_by_answer.json`` - first line is a JSON object
	        mapping model names to lists of line indexes;
	      * ``{judge}_answerby_{judge}.json`` - JSON lines providing
	        ``instruction`` and the reference ``response``;
	      * ``{judge}_answerby_{model}.json`` - JSON lines providing the
	        candidate ``response``.
	    Writes ``{judge}_judge_{model}.json``: one JSON line per shared index,
	    the candidate record extended with ``battle`` (the judge's raw output)
	    and ``index``.
	    """
	    parser = argparse.ArgumentParser()
	    # The two existing help strings both mean "model path".
	    parser.add_argument('--judge', type=str, required=True, help='模型路径')
	    parser.add_argument('--model', type=str, required=True, help='模型路径')
	    parser.add_argument('--data-dir', type=str, default=DEFAULT_DATA_DIR,
	                        help='directory containing the input files and receiving the output file')
	    parser.add_argument('--tensor-parallel-size', type=int, default=8,
	                        help='number of GPUs used for tensor parallelism')
	    args = parser.parse_args()

	    judgename = model_name_from_path(args.judge)
	    modelname = model_name_from_path(args.model)
	    data_dir = args.data_dir

	    # Load and validate all inputs before paying for the model load.
	    with open(f"{data_dir}/{judgename}_filtered_by_answer.json", "r", encoding="utf-8") as f:
	        # Only the first line of this file is used.
	        filtered = json.loads(f.readline())
	    indexes = shared_indexes(filtered, judgename, modelname)
	    print(len(filtered[judgename]))
	    print(len(filtered[modelname]))
	    print(len(indexes))

	    with open(f"{data_dir}/{judgename}_answerby_{judgename}.json", "r", encoding="utf-8") as f1:
	        reference_lines = f1.readlines()
	    with open(f"{data_dir}/{judgename}_answerby_{modelname}.json", "r", encoding="utf-8") as f2:
	        candidate_lines = f2.readlines()

	    # Initialize the tokenizer
	    tokenizer = AutoTokenizer.from_pretrained(args.judge, trust_remote_code=True)

	    # Prepare your prompts
	    prompts = []
	    candidates = []  # parsed candidate records, aligned with `prompts`
	    for index in tqdm(indexes):
	        reference_record = json.loads(reference_lines[index])
	        candidate_record = json.loads(candidate_lines[index])
	        user_prompt = build_user_prompt(
	            reference_record["instruction"],
	            reference_record["response"],
	            candidate_record["response"],
	        )
	        messages = [{"role": "system", "content": system_prompt_message()}, {"role": "user", "content": user_prompt}]
	        text = tokenizer.apply_chat_template(
	            messages,
	            tokenize=False,
	            # Append the assistant-turn header so the judge answers the
	            # user message instead of continuing it.
	            add_generation_prompt=True,
	        )
	        prompts.append(text)
	        candidates.append(candidate_record)

	    # Input the model name or path. Can be GPTQ or AWQ models.
	    llm = LLM(args.judge, dtype='float16', tensor_parallel_size=args.tensor_parallel_size,
	              trust_remote_code=True, enforce_eager=True, max_model_len=5400)
	    sampling_params = SamplingParams(temperature=1.0, top_p=0.95, max_tokens=5400)
	    outputs = llm.generate(prompts=prompts, sampling_params=sampling_params)

	    # Open the output only once results exist, so a failed run does not
	    # truncate the results of a previous one.
	    with open(f"{data_dir}/{judgename}_judge_{modelname}.json", "w", encoding="utf-8") as fw:
	        for output, index, record in zip(outputs, indexes, candidates):
	            record["battle"] = output.outputs[0].text
	            record["index"] = index
	            fw.write(json.dumps(record) + "\n")


	def system_prompt_message() -> str:
	    """Return the system message used for every judge conversation."""
	    return SYSTEM_PROMPT


	if __name__ == "__main__":
	    main()