learn / test_time_scaling /process_model_method.py

Upload folder using huggingface_hub

a80f6e6 verified 10 months ago

3.35 kB

	import os
	import json
	from datasets import load_dataset
	from vllm import LLM, SamplingParams
	from transformers import AutoTokenizer
	import re

	# Model identifiers to process; main() creates one vLLM engine per entry.
	MODEL_NAMES = [
	"Qwen/Qwen2.5-7B-Instruct",
	# "meta-llama/Llama-2-7b-chat-hf",
	# other model names...
	]

	# Subset names passed to load_dataset("cais/mmlu", ...); the commented-out tail lists further candidates.
	DATASETS = ["college_computer_science"] # ,"abstract_algebra", "college_mathematics", "conceptual_physics"]

	# Root output directory; main() nests a per-model and per-subset folder underneath it.
	SAVE_DIR = "data/model_method"

	# Instruction appended after the question and its options by format_prompt().
	# It asks for exactly three concepts in a small JSON object, which
	# parse_model_output() later extracts from the raw completion.
	PROMPT_SUFFIX = (
	"Please identify the key conceptual topics or knowledge components required to solve this question. "
	"List 3 such concepts that are most essential for understanding and solving the problem. "
	"You don't need to explain the concepts or solve the problem, just list them. Output your answer in the following JSON format:\n"
	'{"concepts": ["concept1", "concept2", "concept3"]}'
	)

	def ensure_dir(path):
	    """Create directory ``path`` (including missing parents) if it does not exist.

	    Args:
	        path: Directory path to create.

	    Raises:
	        FileExistsError: If ``path`` already exists but is not a directory.
	    """
	    # exist_ok=True makes creation atomic with respect to the existence check,
	    # so a directory created concurrently between "check" and "create" no
	    # longer causes a spurious failure.
	    os.makedirs(path, exist_ok=True)

	def format_prompt(q):
	    """Render one multiple-choice question as the text prompt sent to the model.

	    Args:
	        q: Mapping with a ``'question'`` string and an ``'options'`` mapping
	            from option label to option text.

	    Returns:
	        The question on its own line, then one ``"<label>. <text>"`` line per
	        option in sorted label order, followed by ``PROMPT_SUFFIX``.
	    """
	    stem = q['question']
	    option_map = q['options']
	    pieces = [f"{stem}\n"]
	    pieces.extend(f"{label}. {option_map[label]}\n" for label in sorted(option_map))
	    pieces.append(PROMPT_SUFFIX)
	    return "".join(pieces)

	# Captures the body of the first `"concepts": [...]` array. Whitespace around
	# the colon is optional so both compact and pretty-printed JSON are matched.
	_CONCEPTS_ARRAY_RE = re.compile(r'"concepts"\s*:\s*\[(.*?)\]', re.DOTALL)
	# Pulls each double-quoted item out of the captured array body.
	_QUOTED_ITEM_RE = re.compile(r'"(.*?)"')


	def parse_model_output(output, top_k=3):
	    """Extract the list of concepts from a raw model completion.

	    The completion is searched with a regular expression rather than parsed as
	    JSON, so the concepts are still found when the model wraps the JSON object
	    in extra text.

	    Args:
	        output: Raw text generated by the model.
	        top_k: Maximum number of concepts to keep (the prompt asks for 3).
	            ``None`` keeps every concept found.

	    Returns:
	        A ``(concepts, uncertainties)`` tuple. ``concepts`` is the list of
	        extracted strings, truncated to ``top_k``; ``uncertainties`` is a list
	        of ``None`` placeholders of the same length. Both lists are empty when
	        no ``"concepts"`` array is found.
	    """
	    found = _CONCEPTS_ARRAY_RE.search(output)
	    if found is None:
	        return [], []
	    concepts = _QUOTED_ITEM_RE.findall(found.group(1))[:top_k]
	    # No uncertainty estimate is computed here; keep one placeholder per concept.
	    uncertainties = [None] * len(concepts)
	    return concepts, uncertainties

	def main():
	    """Generate and save key-concept annotations for every model/subset pair.

	    For each model in ``MODEL_NAMES`` a vLLM engine is created, and for each
	    MMLU subset in ``DATASETS`` every question of the test split is prompted
	    individually. One JSON file per question is written to
	    ``SAVE_DIR/<model name with "/" replaced by "__">/<subset>/question_<idx>.json``
	    containing the question, its options, the gold answer letter, the parsed
	    concepts, placeholder uncertainties and the raw completion.
	    """
	    # Sampling settings are identical for every request, so build them once.
	    # The end-of-text stop marker is written without backslashes: the escaped
	    # spelling kept the backslashes in the string and could never match.
	    sampling_params = SamplingParams(
	        temperature=0.2,
	        top_p=0.9,
	        max_tokens=256,
	        stop=["User:", "<|endoftext|>"],
	    )

	    for model_name in MODEL_NAMES:
	        print(f"Processing model: {model_name}")
	        # NOTE(review): the prompt is sent as plain text, without the model's
	        # chat template — confirm this is intended for instruct models.
	        llm = LLM(model=model_name, tensor_parallel_size=2)
	        for subset in DATASETS:
	            print(f"Processing {subset}...")
	            ds = load_dataset("cais/mmlu", subset, split="test")
	            # "/" in a model name would otherwise create an extra directory level.
	            save_path = os.path.join(SAVE_DIR, model_name.replace("/", "__"), subset)
	            ensure_dir(save_path)
	            for idx, item in enumerate(ds):
	                # Label the choices A, B, C, ... (65 is ord("A")); the gold
	                # answer index is mapped to a letter the same way.
	                options = {chr(65 + i): choice for i, choice in enumerate(item['choices'])}
	                answer = chr(65 + item['answer'])
	                qdict = {"question": item['question'], "options": options, "answer": answer}
	                prompt = format_prompt(qdict)
	                outputs = llm.generate([prompt], sampling_params)
	                # One prompt, one sample: take the first completion of the first request.
	                model_output = outputs[0].outputs[0].text.strip()
	                topk_tokens, uncertainties = parse_model_output(model_output)
	                out = {
	                    "question": qdict["question"],
	                    "options": qdict["options"],
	                    "answer": qdict["answer"],
	                    "topk_tokens": topk_tokens,
	                    "uncertainties": uncertainties,
	                    "raw_output": model_output
	                }
	                fname = os.path.join(save_path, f"question_{idx:04d}.json")
	                with open(fname, 'w', encoding='utf-8') as f:
	                    json.dump(out, f, ensure_ascii=False, indent=2)
	                # Progress message every 20 questions.
	                if idx % 20 == 0:
	                    print(f"Saved {fname}")

	# Run the generation pipeline only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	main()