"""Generate easier/harder variants of multiple-choice questions with vLLM.

For every question in a dataset split, a prompt asking for an easier or harder
variant is built, sent to the model, and the raw completion is written to disk
(one JSON file per question plus a combined ``all_results.json``).
"""
import json
import os
from typing import Iterable, Optional

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

DEFAULT_SAVE_DIR = "data/concept_difficulty_augment"


def ensure_dir(path):
    """Create ``path`` (including missing parents) if it does not exist yet."""
    # exist_ok=True avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path, exist_ok=True)


# Prompt template.
# NOTE(review): this file was recovered from whitespace-collapsed text; the
# line breaks inside the template (including the blank line before
# "Original question:") were reconstructed -- confirm against the original.
PROMPT_TEMPLATE = '''Given the following question, design a new question that involves the same key concepts but is {difficulty} than the original. \
The new question should have the same format (question stem and options). \
Output your result in the following JSON format strictly (do not add any explanation):
{{"question": "...", "options": {{"A": "...", "B": "...", "C": "...", "D": "..."}}}}

Original question:
Question: {question}
A. {A}
B. {B}
C. {C}
D. {D}'''

# Supported difficulty types.
difficulty_types = ["easier", "harder"]


def _build_prompt(item, difficulty):
    """Return ``(options, prompt)`` for one dataset row.

    ``item`` must provide ``item['question']`` and a sequence
    ``item['choices']``; choices are labelled "A", "B", "C", ... in order.
    Labels A-D that have no matching choice are rendered as empty strings.
    """
    options = {
        chr(ord("A") + i): choice for i, choice in enumerate(item["choices"])
    }
    prompt = PROMPT_TEMPLATE.format(
        difficulty=difficulty,
        question=item["question"],
        A=options.get("A", ""),
        B=options.get("B", ""),
        C=options.get("C", ""),
        D=options.get("D", ""),
    )
    return options, prompt


def _write_json(path, payload):
    """Serialize ``payload`` to ``path`` as indented UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


# Main processing function.
def generate_difficulty_augmented_questions(
    model_name,
    dataset_name,
    dataset_config,
    split,
    difficulty="easier",
    save_dir=DEFAULT_SAVE_DIR,
    max_questions: Optional[int] = None,
    llm: Optional[LLM] = None,
    tensor_parallel_size: int = 2,
):
    """Generate difficulty-shifted variants for one dataset config.

    Args:
        model_name: Model identifier; used to load the model (when ``llm`` is
            not supplied) and, with "/" replaced by "__", as an output
            sub-directory name.
        dataset_name: Dataset identifier passed to ``load_dataset``.
        dataset_config: Dataset configuration name; also an output
            sub-directory name.
        split: Dataset split to load.
        difficulty: Word substituted into the prompt (e.g. "easier",
            "harder"); also the innermost output sub-directory name.
        save_dir: Root directory for the output files.
        max_questions: If not ``None``, only the first ``max_questions`` rows
            are processed.
        llm: An already constructed ``vllm.LLM`` to reuse. When ``None`` a new
            engine is created for this call.
        tensor_parallel_size: Passed to ``LLM`` when a new engine is created.

    Side effects:
        Writes ``question_XXXX.json`` per processed row and
        ``all_results.json`` under
        ``save_dir/<model>/<dataset_config>/<difficulty>`` and prints a
        summary line.
    """
    if llm is None:
        llm = LLM(model=model_name, tensor_parallel_size=tensor_parallel_size)

    ds = load_dataset(dataset_name, dataset_config, split=split)
    save_path = os.path.join(
        save_dir, model_name.replace("/", "__"), dataset_config, difficulty
    )
    ensure_dir(save_path)

    # Build every prompt first so the engine can batch them in one call
    # instead of being invoked once per question.
    records = []
    for idx, item in enumerate(tqdm(ds, desc=f"{dataset_config} | {difficulty}")):
        if max_questions is not None and idx >= max_questions:
            break
        options, prompt = _build_prompt(item, difficulty)
        records.append(
            {
                "original_question": item["question"],
                "original_options": options,
                "prompt": prompt,
                "difficulty": difficulty,
            }
        )

    # Loop-invariant: one SamplingParams object serves all prompts.
    sampling_params = SamplingParams(
        temperature=0.7,
        top_p=0.95,
        max_tokens=1024,
        stop=["User:", "<|endoftext|>", "You are a helpful assistant."],
    )
    prompts = [record["prompt"] for record in records]
    # Skip the engine entirely when there is nothing to generate.
    outputs = llm.generate(prompts, sampling_params) if prompts else []

    results = []
    # Outputs are paired with prompts by position.
    for idx, (record, output) in enumerate(zip(records, outputs)):
        record["model_output"] = output.outputs[0].text.strip()
        _write_json(os.path.join(save_path, f"question_{idx:04d}.json"), record)
        results.append(record)

    # Save the combined results.
    _write_json(os.path.join(save_path, "all_results.json"), results)
    print(f"Saved {len(results)} augmented questions to {save_path}")


# Batch-process multiple datasets and difficulties.
def batch_generate(
    model_name,
    dataset_configs: Iterable[str],
    split="test",
    difficulties=None,
    max_questions: Optional[int] = None,
    dataset_name="cais/mmlu",
    save_dir=DEFAULT_SAVE_DIR,
    tensor_parallel_size: int = 2,
):
    """Run the generation for every (dataset config, difficulty) pair.

    The model is loaded once and shared across all pairs rather than being
    re-created for each one.

    Args:
        model_name: Model identifier to load.
        dataset_configs: Dataset configuration names to process.
        split: Dataset split to load for every config.
        difficulties: Difficulty words to generate; defaults to
            ``difficulty_types``.
        max_questions: Optional per-config cap on processed rows.
        dataset_name: Dataset identifier passed to ``load_dataset``.
        save_dir: Root directory for the output files.
        tensor_parallel_size: Passed to ``LLM`` when loading the model.
    """
    if difficulties is None:
        difficulties = difficulty_types
    dataset_configs = list(dataset_configs)
    difficulties = list(difficulties)
    if not dataset_configs or not difficulties:
        # Nothing to do; avoid loading the model at all.
        return

    llm = LLM(model=model_name, tensor_parallel_size=tensor_parallel_size)
    for dataset_config in dataset_configs:
        for difficulty in difficulties:
            generate_difficulty_augmented_questions(
                model_name=model_name,
                dataset_name=dataset_name,
                dataset_config=dataset_config,
                split=split,
                difficulty=difficulty,
                save_dir=save_dir,
                max_questions=max_questions,
                llm=llm,
            )


if __name__ == "__main__":
    MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
    DATASETS = ["abstract_algebra"]
    batch_generate(
        MODEL_NAME,
        DATASETS,
        split="test",
        difficulties=["easier", "harder"],
        max_questions=None,
    )