learn/test_time_scaling/concept_difficulty_augment.py

Upload folder using huggingface_hub

a80f6e6 verified 10 months ago

3.57 kB

	import os
	import json
	from datasets import load_dataset
	from vllm import LLM, SamplingParams
	from transformers import AutoTokenizer
	from tqdm import tqdm
	import pandas as pd

	def ensure_dir(path):
	    """Create directory *path*, including missing parents, if it is absent.

	    Calling this on a directory that already exists is a no-op.

	    Args:
	        path: Directory path to create.

	    Raises:
	        FileExistsError: If *path* exists but is not a directory.
	    """
	    # exist_ok=True removes the check-then-create race: another process may
	    # create the directory between an ``os.path.exists`` test and ``makedirs``.
	    os.makedirs(path, exist_ok=True)

	# Prompt template.
	# Filled in with ``str.format``; the named placeholders are {difficulty},
	# {question} and the four option texts {A}, {B}, {C}, {D}.
	# Doubled braces ({{ and }}) are format escapes that render as literal
	# braces, so the JSON example reaches the model with single braces.
	# The trailing backslashes on the first two lines are line continuations:
	# they join those lines into one without inserting a newline.
	# NOTE(review): the continuation lines inside the literal begin with a tab,
	# which becomes part of the prompt text — confirm this is intended.
	PROMPT_TEMPLATE = '''Given the following question, design a new question that involves the same key concepts but is {difficulty} than the original. \
	The new question should have the same format (question stem and options). \
	Output your result in the following JSON format strictly (do not add any explanation):

	{{"question": "...", "options": {{"A": "...", "B": "...", "C": "...", "D": "..."}}}}

	Original question:
	Question: {question}
	A. {A}
	B. {B}
	C. {C}
	D. {D}'''

	# Supported difficulty types; each value is substituted for {difficulty}
	# in the prompt ("... but is easier/harder than the original").
	difficulty_types = [
	    "easier",
	    "harder",
	]

	# Main processing function (plus its private helpers).

	# Engines already created in this process, keyed by model name. Building a
	# vLLM engine loads the full model, so it is done once per model instead of
	# once per (dataset, difficulty) call.
	_LLM_CACHE = {}


	def _get_llm(model_name):
	    """Return the vLLM engine for *model_name*, creating it on first use."""
	    if model_name not in _LLM_CACHE:
	        _LLM_CACHE[model_name] = LLM(model=model_name, tensor_parallel_size=2)
	    return _LLM_CACHE[model_name]


	def _build_prompt(item, difficulty):
	    """Return ``(options, prompt)`` for one dataset row.

	    ``options`` maps "A", "B", ... to the entries of ``item['choices']`` in
	    order. Only A-D are inserted into the prompt; a missing letter is
	    rendered as an empty string.
	    """
	    options = {
	        chr(ord("A") + position): choice
	        for position, choice in enumerate(item["choices"])
	    }
	    prompt = PROMPT_TEMPLATE.format(
	        difficulty=difficulty,
	        question=item["question"],
	        A=options.get("A", ""),
	        B=options.get("B", ""),
	        C=options.get("C", ""),
	        D=options.get("D", ""),
	    )
	    return options, prompt


	def generate_difficulty_augmented_questions(
	    model_name,
	    dataset_name,
	    dataset_config,
	    split,
	    difficulty="easier",
	    save_dir="data/concept_difficulty_augment",
	    max_questions=None,
	):
	    """Generate an easier/harder variant of every question in a dataset split.

	    For each row a prompt is built from ``PROMPT_TEMPLATE``, sent to the
	    model, and the raw completion is stored next to the original question.
	    Output goes to ``<save_dir>/<model_name with "/" -> "__">/<dataset_config>/
	    <difficulty>/``: one ``question_NNNN.json`` per row, written as soon as
	    the row is done, plus ``all_results.json`` holding every record.

	    Args:
	        model_name: Model identifier handed to vLLM; also used in the
	            output path.
	        dataset_name: Dataset identifier passed to ``load_dataset``.
	        dataset_config: Dataset configuration name passed to
	            ``load_dataset``; also used in the output path and progress
	            label.
	        split: Dataset split passed to ``load_dataset``.
	        difficulty: Word substituted into the prompt ("easier"/"harder");
	            also used in the output path.
	        save_dir: Root directory for all output files.
	        max_questions: Process at most this many rows; ``None`` means all.

	    Returns:
	        None. Results are written to disk and a summary line is printed.
	    """
	    llm = _get_llm(model_name)
	    ds = load_dataset(dataset_name, dataset_config, split=split)
	    save_path = os.path.join(
	        save_dir, model_name.replace("/", "__"), dataset_config, difficulty
	    )
	    ensure_dir(save_path)

	    # Loop-invariant, so built once. The stop strings use a plain "|":
	    # "\|" is not a valid escape and would leave a literal backslash in the
	    # string, so "<|endoftext|>" could never match.
	    sampling_params = SamplingParams(
	        temperature=0.7,
	        top_p=0.95,
	        max_tokens=1024,
	        stop=["User:", "<|endoftext|>", "You are a helpful assistant."],
	    )

	    results = []
	    for idx, item in enumerate(tqdm(ds, desc=f"{dataset_config} | {difficulty}")):
	        if max_questions is not None and idx >= max_questions:
	            break
	        options, prompt = _build_prompt(item, difficulty)
	        outputs = llm.generate([prompt], sampling_params)
	        model_output = outputs[0].outputs[0].text.strip()
	        out = {
	            "original_question": item["question"],
	            "original_options": options,
	            "prompt": prompt,
	            "difficulty": difficulty,
	            "model_output": model_output,
	        }
	        # Write each record immediately so finished work survives a crash.
	        fname = os.path.join(save_path, f"question_{idx:04d}.json")
	        with open(fname, "w", encoding="utf-8") as f:
	            json.dump(out, f, ensure_ascii=False, indent=2)
	        results.append(out)

	    # Save the combined results.
	    with open(os.path.join(save_path, "all_results.json"), "w", encoding="utf-8") as f:
	        json.dump(results, f, ensure_ascii=False, indent=2)
	    print(f"Saved {len(results)} augmented questions to {save_path}")

	# Batch driver: covers several dataset configurations and difficulty levels.
	def batch_generate(model_name, dataset_configs, split="test", difficulties=None, max_questions=None):
	    """Run the augmentation for every (dataset config, difficulty) pair.

	    The dataset is always "cais/mmlu"; only its configuration name varies.

	    Args:
	        model_name: Model identifier forwarded to the generator.
	        dataset_configs: Iterable of configuration names to process.
	        split: Dataset split forwarded to the generator.
	        difficulties: Difficulty words to generate; ``None`` selects the
	            module-level ``difficulty_types``.
	        max_questions: Per-run cap forwarded to the generator.
	    """
	    levels = difficulty_types if difficulties is None else difficulties
	    for config_name in dataset_configs:
	        for level in levels:
	            generate_difficulty_augmented_questions(
	                model_name=model_name,
	                dataset_name="cais/mmlu",
	                dataset_config=config_name,
	                split=split,
	                difficulty=level,
	                max_questions=max_questions,
	            )

	# Script entry point: augment one MMLU subject in both directions.
	if __name__ == "__main__":
	    MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
	    DATASETS = ["abstract_algebra"]
	    batch_generate(
	        MODEL_NAME,
	        DATASETS,
	        split="test",
	        difficulties=["easier", "harder"],
	        max_questions=None,
	    )