| import os |
| import json |
| from datasets import load_dataset |
| from vllm import LLM, SamplingParams |
| from transformers import AutoTokenizer |
| from tqdm import tqdm |
| import pandas as pd |
|
|
def ensure_dir(path):
    """Create directory ``path`` (and any missing parents) if it is not already there.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True makes creation a single step, so another process creating
    # the same directory between a separate "exists?" check and the "create"
    # call can no longer make this fail.
    os.makedirs(path, exist_ok=True)
|
|
| |
# Prompt text sent to the model for every source question. It is filled in with
# str.format(): {difficulty}, {question} and {A}..{D} are placeholders, while
# the doubled braces render as literal braces in the JSON example. The trailing
# backslashes join the first three source lines into a single line of prompt
# text (no newline is emitted at those points).
PROMPT_TEMPLATE = '''Given the following question, design a new question that involves the same key concepts but is {difficulty} than the original. \
The new question should have the same format (question stem and options). \
Output your result in the following JSON format strictly (do not add any explanation):

{{"question": "...", "options": {{"A": "...", "B": "...", "C": "...", "D": "..."}}}}

Original question:
Question: {question}
A. {A}
B. {B}
C. {C}
D. {D}'''
|
|
| |
# Difficulty directions substituted into the prompt's {difficulty} slot; also
# the default set iterated by batch_generate when no difficulties are passed.
difficulty_types = ["easier", "harder"]
|
|
| |
# vLLM engines already built in this process, keyed by
# (model_name, tensor_parallel_size), so repeated calls reuse one engine
# instead of reloading the model weights every time.
_LLM_CACHE = {}


def _get_llm(model_name, tensor_parallel_size):
    """Return the engine for these settings, constructing it only on first request."""
    key = (model_name, tensor_parallel_size)
    if key not in _LLM_CACHE:
        _LLM_CACHE[key] = LLM(model=model_name, tensor_parallel_size=tensor_parallel_size)
    return _LLM_CACHE[key]


def generate_difficulty_augmented_questions(model_name, dataset_name, dataset_config, split, difficulty="easier", save_dir="data/concept_difficulty_augment", max_questions=None, tensor_parallel_size=2):
    """Ask the model to rewrite each dataset question at a different difficulty and save the raw outputs.

    Every item is expected to provide ``item['question']`` and
    ``item['choices']``; the choices are labelled "A", "B", "C", ... in order
    and the first four labels are inserted into ``PROMPT_TEMPLATE`` (a missing
    label becomes an empty string).

    Files are written under
    ``<save_dir>/<model_name with "/" replaced by "__">/<dataset_config>/<difficulty>/``:
    one ``question_NNNN.json`` per question plus ``all_results.json`` holding
    the full list. The per-question files are written after the whole batch
    has been generated.

    Args:
        model_name: Model identifier passed to vLLM; also used in the output path.
        dataset_name: Dataset identifier passed to ``load_dataset``.
        dataset_config: Dataset configuration name; also used in the output path.
        split: Dataset split to load.
        difficulty: Word substituted into the prompt's ``{difficulty}`` slot.
        save_dir: Root directory for the generated files.
        max_questions: If not None, only the first ``max_questions`` items are used.
        tensor_parallel_size: Forwarded to the vLLM ``LLM`` constructor.

    Returns:
        The list of result dicts that was written to ``all_results.json``.
    """
    ds = load_dataset(dataset_name, dataset_config, split=split)
    save_path = os.path.join(save_dir, model_name.replace("/", "__"), dataset_config, difficulty)
    ensure_dir(save_path)

    # Pass 1: build every prompt up front so generation can run as one batch.
    results = []
    for idx, item in enumerate(ds):
        if max_questions is not None and idx >= max_questions:
            break
        options = {chr(65 + i): choice for i, choice in enumerate(item['choices'])}
        prompt = PROMPT_TEMPLATE.format(
            difficulty=difficulty,
            question=item['question'],
            A=options.get('A', ''),
            B=options.get('B', ''),
            C=options.get('C', ''),
            D=options.get('D', ''),
        )
        results.append({
            "original_question": item['question'],
            "original_options": options,
            "prompt": prompt,
            "difficulty": difficulty,
            "model_output": None,  # filled in after generation
        })

    # Pass 2: one batched generate call; skipped entirely when there is
    # nothing to generate, so the model is not loaded for an empty run.
    if results:
        llm = _get_llm(model_name, tensor_parallel_size)
        sampling_params = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=1024, stop=["User:", "<|endoftext|>", "You are a helpful assistant."])
        # NOTE(review): the prompt is sent as raw text. The previous version
        # loaded a tokenizer here but never used it; if a chat template was
        # meant to be applied, that still needs to be added — confirm.
        outputs = llm.generate([record["prompt"] for record in results], sampling_params)
        # Outputs are paired with prompts by position.
        for record, output in zip(results, outputs):
            record["model_output"] = output.outputs[0].text.strip()

    # Pass 3: persist one file per question, then the combined list.
    for idx, record in enumerate(tqdm(results, desc=f"{dataset_config} | {difficulty}")):
        fname = os.path.join(save_path, f"question_{idx:04d}.json")
        with open(fname, 'w', encoding='utf-8') as f:
            json.dump(record, f, ensure_ascii=False, indent=2)

    with open(os.path.join(save_path, "all_results.json"), 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(results)} augmented questions to {save_path}")
    return results
|
|
| |
def batch_generate(model_name, dataset_configs, split="test", difficulties=None, max_questions=None):
    """Run question generation for every (dataset config, difficulty) pair.

    Pairs are processed config by config, covering each difficulty for one
    config before moving to the next. All runs use the "cais/mmlu" dataset.

    Args:
        model_name: Model identifier forwarded to the generation function.
        dataset_configs: Iterable of dataset configuration names.
        split: Dataset split forwarded to the generation function.
        difficulties: Iterable of difficulty words; ``None`` selects the
            module-level ``difficulty_types``.
        max_questions: Per-run question cap forwarded unchanged.
    """
    levels = difficulty_types if difficulties is None else difficulties
    # Lazy pair stream: same visiting order as two nested for-loops.
    jobs = ((config, level) for config in dataset_configs for level in levels)
    for config, level in jobs:
        generate_difficulty_augmented_questions(
            model_name=model_name,
            dataset_name="cais/mmlu",
            dataset_config=config,
            split=split,
            difficulty=level,
            max_questions=max_questions,
        )
|
|
if __name__ == "__main__":
    # Script entry point: generate both easier and harder variants for the
    # listed dataset configs, with no cap on the number of questions.
    MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
    DATASETS = ["abstract_algebra"]
    batch_generate(
        model_name=MODEL_NAME,
        dataset_configs=DATASETS,
        split="test",
        difficulties=["easier", "harder"],
        max_questions=None,
    )