# File size: 3,572 Bytes
#
# a80f6e6

import os
import json
from datasets import load_dataset
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from tqdm import tqdm
import pandas as pd

def ensure_dir(path):
    """Create directory *path* (including missing parents) if it is absent.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first, and still fails loudly when the path is
    # occupied by a regular file instead of silently pretending success.
    os.makedirs(path, exist_ok=True)

# Prompt template. It is filled in with str.format using the placeholders
# {difficulty}, {question} and the option texts {A}, {B}, {C}, {D}.
# The doubled braces ({{ and }}) are str.format escapes that render as literal
# braces in the JSON example; a trailing backslash joins a line with the next
# one, so the first three sentences form a single line of the prompt.
PROMPT_TEMPLATE = '''Given the following question, design a new question that involves the same key concepts but is {difficulty} than the original. \
The new question should have the same format (question stem and options). \
Output your result in the following JSON format strictly (do not add any explanation):

{{"question": "...", "options": {{"A": "...", "B": "...", "C": "...", "D": "..."}}}}

Original question:
Question: {question}
A. {A}
B. {B}
C. {C}
D. {D}'''

# Supported difficulty directions; batch_generate falls back to this list
# when it is called without an explicit `difficulties` argument.
difficulty_types = ["easier", "harder"]

# Main processing function
def generate_difficulty_augmented_questions(
    model_name,
    dataset_name,
    dataset_config,
    split,
    difficulty="easier",
    save_dir="data/concept_difficulty_augment",
    max_questions=None,
    llm=None,
    tensor_parallel_size=2,
):
    """Ask the model for an easier/harder variant of each question in a split.

    For every dataset item the prompt is built from PROMPT_TEMPLATE, sent to
    the model, and the raw completion is written to
    ``<save_dir>/<model_name with "/" -> "__">/<dataset_config>/<difficulty>/question_XXXX.json``.
    After the loop all records are also written to ``all_results.json`` in
    the same directory.

    Args:
        model_name: Model identifier; used to build the engine (when ``llm``
            is not supplied) and as part of the output path.
        dataset_name: Name passed to ``load_dataset``.
        dataset_config: Dataset configuration passed to ``load_dataset``;
            also part of the output path.
        split: Dataset split to load.
        difficulty: Word substituted into the prompt ("easier"/"harder").
        save_dir: Root directory for the output files.
        max_questions: Stop after this many items; ``None`` means no limit.
        llm: Optional, already constructed vLLM engine to reuse. When
            ``None`` a new engine is created for this call.
        tensor_parallel_size: Tensor-parallel degree used only when a new
            engine has to be created.

    Returns:
        The list of result dicts (one per processed item), each with the keys
        ``original_question``, ``original_options``, ``prompt``,
        ``difficulty`` and ``model_output``.
    """
    # Building an engine loads the full model weights, so callers that run
    # several configurations should construct it once and pass it in.
    if llm is None:
        llm = LLM(model=model_name, tensor_parallel_size=tensor_parallel_size)

    # The sampling settings never change between items; build them once.
    sampling_params = SamplingParams(
        temperature=0.7,
        top_p=0.95,
        max_tokens=1024,
        stop=["User:", "<|endoftext|>", "You are a helpful assistant."],
    )

    ds = load_dataset(dataset_name, dataset_config, split=split)
    save_path = os.path.join(save_dir, model_name.replace("/", "__"), dataset_config, difficulty)
    ensure_dir(save_path)

    results = []
    for idx, item in enumerate(tqdm(ds, desc=f"{dataset_config} | {difficulty}")):
        if max_questions is not None and idx >= max_questions:
            break

        # Label the choices "A", "B", "C", ... in their original order.
        options = {chr(ord("A") + i): choice for i, choice in enumerate(item['choices'])}
        # Only A-D appear in the template; a missing letter becomes "".
        prompt = PROMPT_TEMPLATE.format(
            difficulty=difficulty,
            question=item['question'],
            A=options.get('A', ''),
            B=options.get('B', ''),
            C=options.get('C', ''),
            D=options.get('D', ''),
        )

        outputs = llm.generate([prompt], sampling_params)
        model_output = outputs[0].outputs[0].text.strip()

        out = {
            "original_question": item['question'],
            "original_options": options,
            "prompt": prompt,
            "difficulty": difficulty,
            "model_output": model_output
        }
        # Write each record as soon as it is produced so partial progress
        # is kept on disk if a later item fails.
        fname = os.path.join(save_path, f"question_{idx:04d}.json")
        with open(fname, 'w', encoding='utf-8') as f:
            json.dump(out, f, ensure_ascii=False, indent=2)
        results.append(out)

    # Save the combined results
    with open(os.path.join(save_path, "all_results.json"), 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(results)} augmented questions to {save_path}")
    return results


# Batch processing over several dataset configs and difficulty levels
def batch_generate(model_name, dataset_configs, split="test", difficulties=None, max_questions=None, tensor_parallel_size=2):
    """Run the augmentation for every (dataset config, difficulty) pair.

    The dataset is always "cais/mmlu". A single vLLM engine is created on
    first use and shared by all runs instead of being rebuilt per pair.

    Args:
        model_name: Model identifier used for the engine and output paths.
        dataset_configs: Iterable of dataset configuration names.
        split: Dataset split to load for each configuration.
        difficulties: Iterable of difficulty words; defaults to
            ``difficulty_types`` when ``None``.
        max_questions: Per-run item limit; ``None`` means no limit.
        tensor_parallel_size: Tensor-parallel degree for the shared engine.
    """
    if difficulties is None:
        difficulties = difficulty_types

    # Created lazily so that nothing is loaded when there is no work to do.
    llm = None
    for dataset_config in dataset_configs:
        for difficulty in difficulties:
            if llm is None:
                llm = LLM(model=model_name, tensor_parallel_size=tensor_parallel_size)
            generate_difficulty_augmented_questions(
                model_name=model_name,
                dataset_name="cais/mmlu",
                dataset_config=dataset_config,
                split=split,
                difficulty=difficulty,
                max_questions=max_questions,
                llm=llm,
                tensor_parallel_size=tensor_parallel_size,
            )

if __name__ == "__main__":
    # Script entry point: augment one MMLU subject in both difficulty
    # directions, with no cap on the number of questions.
    MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
    DATASETS = ["abstract_algebra"]
    batch_generate(
        model_name=MODEL_NAME,
        dataset_configs=DATASETS,
        split="test",
        difficulties=["easier", "harder"],
        max_questions=None,
    )