# File size: 3,572 Bytes | a80f6e6
import os
import json
from datasets import load_dataset
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from tqdm import tqdm
import pandas as pd
def ensure_dir(path):
    """Create directory ``path`` (including missing parents) if it is absent.

    Uses ``exist_ok=True`` instead of a separate existence check so that a
    directory created by another process between the check and the creation
    does not raise ``FileExistsError``.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)
# Prompt template. Placeholders filled by str.format(): {difficulty},
# {question} and the four option texts {A}..{D}. The literal JSON braces are
# doubled so that str.format() emits them unchanged.
PROMPT_TEMPLATE = (
    "Given the following question, design a new question that involves the same key concepts but is {difficulty} than the original. "
    "The new question should have the same format (question stem and options). "
    "Output your result in the following JSON format strictly (do not add any explanation):\n"
    '{{"question": "...", "options": {{"A": "...", "B": "...", "C": "...", "D": "..."}}}}\n'
    "Original question:\n"
    "Question: {question}\n"
    "A. {A}\n"
    "B. {B}\n"
    "C. {C}\n"
    "D. {D}"
)

# Supported difficulty directions; used as the default set by batch_generate.
difficulty_types = ["easier", "harder"]
# Single-slot cache so that repeated calls with the same model name reuse one
# vLLM engine instead of loading the weights again on every call.
_LLM_CACHE = {}


def _get_llm(model_name, tensor_parallel_size=2):
    """Return the vLLM engine for ``model_name``, building it on first use."""
    if model_name not in _LLM_CACHE:
        # Drop the reference to any engine cached for a different model
        # before constructing the new one.
        _LLM_CACHE.clear()
        _LLM_CACHE[model_name] = LLM(model=model_name, tensor_parallel_size=tensor_parallel_size)
    return _LLM_CACHE[model_name]


def _build_record(item, difficulty):
    """Build the output record (prompt included, model output not yet set) for one row."""
    # Label the choices A, B, C, ... in their dataset order.
    options = {chr(ord("A") + i): choice for i, choice in enumerate(item['choices'])}
    prompt = PROMPT_TEMPLATE.format(
        difficulty=difficulty,
        question=item['question'],
        # The template only has slots for A-D; missing options become "".
        A=options.get('A', ''),
        B=options.get('B', ''),
        C=options.get('C', ''),
        D=options.get('D', '')
    )
    return {
        "original_question": item['question'],
        "original_options": options,
        "prompt": prompt,
        "difficulty": difficulty,
    }


# Main processing function
def generate_difficulty_augmented_questions(model_name, dataset_name, dataset_config, split, difficulty="easier", save_dir="data/concept_difficulty_augment", max_questions=None):
    """Generate difficulty-shifted variants of multiple-choice questions.

    For each row of the dataset (each row must provide ``question`` and
    ``choices``), a prompt is built from ``PROMPT_TEMPLATE`` and sent to the
    model. Results are written under
    ``<save_dir>/<model_name with "/" -> "__">/<dataset_config>/<difficulty>/``
    as one ``question_NNNN.json`` per row plus a combined ``all_results.json``.

    Args:
        model_name: Model identifier passed to vLLM.
        dataset_name: Dataset identifier passed to ``load_dataset``.
        dataset_config: Dataset configuration name; also used as a path
            component of the output directory.
        split: Dataset split to load.
        difficulty: Word substituted into the prompt (e.g. "easier", "harder").
        save_dir: Root directory for the output files.
        max_questions: If not None, only the first ``max_questions`` rows are
            processed.

    Returns:
        None. Output is written to disk and a summary line is printed.
    """
    # Load the data first so that a bad dataset/config/split fails before the
    # (expensive) model load.
    ds = load_dataset(dataset_name, dataset_config, split=split)
    save_path = os.path.join(save_dir, model_name.replace("/", "__"), dataset_config, difficulty)
    ensure_dir(save_path)

    results = []
    for idx, item in enumerate(tqdm(ds, desc=f"{dataset_config} | {difficulty}")):
        if max_questions is not None and idx >= max_questions:
            break
        results.append(_build_record(item, difficulty))

    # NOTE(review): prompts are sent as raw text; the tokenizer that used to be
    # loaded here was never used, so no chat template is applied. Confirm
    # whether the instruct model should receive chat-formatted input.
    llm = _get_llm(model_name)
    sampling_params = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=1024, stop=["User:", "<|endoftext|>","You are a helpful assistant."])
    prompts = [record["prompt"] for record in results]
    # One batched call lets vLLM schedule all prompts together; per the vLLM
    # documentation the outputs come back in the same order as the prompts.
    outputs = llm.generate(prompts, sampling_params) if prompts else []

    for idx, (record, output) in enumerate(zip(results, outputs)):
        record["model_output"] = output.outputs[0].text.strip()
        fname = os.path.join(save_path, f"question_{idx:04d}.json")
        with open(fname, 'w', encoding='utf-8') as f:
            json.dump(record, f, ensure_ascii=False, indent=2)

    # Save the combined results
    with open(os.path.join(save_path, "all_results.json"), 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(results)} augmented questions to {save_path}")
# Batch-process several dataset configs and difficulties
def batch_generate(model_name, dataset_configs, split="test", difficulties=None, max_questions=None, dataset_name="cais/mmlu"):
    """Run question augmentation for every (dataset config, difficulty) pair.

    Args:
        model_name: Model identifier forwarded to the generation function.
        dataset_configs: Iterable of dataset configuration names; a single
            string is treated as one configuration.
        split: Dataset split to load for every configuration.
        difficulties: Iterable of difficulty words; a single string is treated
            as one difficulty. Defaults to ``difficulty_types`` when None.
        max_questions: Per-run cap on processed rows, or None for no cap.
        dataset_name: Dataset identifier; defaults to "cais/mmlu", the value
            that was previously hard-coded.
    """
    if difficulties is None:
        difficulties = difficulty_types
    # A bare string would otherwise be iterated character by character.
    if isinstance(dataset_configs, str):
        dataset_configs = [dataset_configs]
    if isinstance(difficulties, str):
        difficulties = [difficulties]
    for dataset_config in dataset_configs:
        for difficulty in difficulties:
            generate_difficulty_augmented_questions(
                model_name=model_name,
                dataset_name=dataset_name,
                dataset_config=dataset_config,
                split=split,
                difficulty=difficulty,
                max_questions=max_questions
            )
if __name__ == "__main__":
    # Script entry point: augment the "abstract_algebra" test split in both
    # difficulty directions, with no cap on the number of questions.
    MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
    DATASETS = ["abstract_algebra"]
    batch_generate(MODEL_NAME, DATASETS, split="test", difficulties=["easier", "harder"], max_questions=None)