"""Ask vLLM-served models for the key concepts behind MMLU questions.

For every model in ``MODEL_NAMES`` and every MMLU subset in ``DATASETS`` the
script builds one prompt per test question, generates a completion, extracts
the list of concepts from the completion, and writes one JSON file per
question under ``SAVE_DIR/<model>/<subset>/``.
"""
import json
import os
import re

from datasets import load_dataset
from transformers import AutoTokenizer  # noqa: F401  (not used by the code below)
from vllm import LLM, SamplingParams

MODEL_NAMES = [
    "Qwen/Qwen2.5-7B-Instruct",
    # "meta-llama/Llama-2-7b-chat-hf",
    # other model names...
]
# Other subsets: "abstract_algebra", "college_mathematics", "conceptual_physics"
DATASETS = ["college_computer_science"]
SAVE_DIR = "data/model_method"

# Maximum number of concepts kept per question; matches the "List 3 such
# concepts" request in PROMPT_SUFFIX.
TOP_K = 3

PROMPT_SUFFIX = (
    "Please identify the key conceptual topics or knowledge components required to solve this question. "
    "List 3 such concepts that are most essential for understanding and solving the problem. "
    "You don't need to explain the concepts or solve the problem, just list them. Output your answer in the following JSON format:\n"
    '{"concepts": ["concept1", "concept2", "concept3"]}'
)

# Compiled once: the body of the "concepts" array, and the quoted items in it.
_CONCEPTS_ARRAY_RE = re.compile(r'"concepts"\s*:\s*\[(.*?)\]', re.DOTALL)
_QUOTED_ITEM_RE = re.compile(r'"(.*?)"')


def ensure_dir(path):
    """Create directory ``path`` (and any missing parents) if it is absent."""
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(path, exist_ok=True)


def format_prompt(q):
    """Build the prompt text for one question.

    Args:
        q: Mapping with a ``'question'`` string and an ``'options'`` mapping
            from option label to option text.

    Returns:
        The question, one ``"<label>. <text>"`` line per option in sorted
        label order, followed by ``PROMPT_SUFFIX``.
    """
    options = q['options']
    lines = [f"{q['question']}\n"]
    lines.extend(f"{key}. {options[key]}\n" for key in sorted(options))
    lines.append(PROMPT_SUFFIX)
    return "".join(lines)


def parse_model_output(output):
    """Extract up to ``TOP_K`` concepts from a model completion.

    The first ``"concepts": [...]`` array found in ``output`` is used. Its
    body is parsed as JSON when possible, so escaped quotes inside a concept
    are handled; if the body is not valid JSON, the quoted substrings are
    pulled out with a regular expression instead.

    Args:
        output: Raw text generated by the model.

    Returns:
        A ``(concepts, uncertainties)`` tuple of equal-length lists.
        ``uncertainties`` is a placeholder filled with ``None``. Both lists
        are empty when no concepts array is found.
    """
    match = _CONCEPTS_ARRAY_RE.search(output)
    if match is None:
        return [], []

    array_body = match.group(1)
    try:
        parsed = json.loads(f"[{array_body}]")
    except ValueError:
        # Malformed JSON (e.g. missing commas): fall back to quoted substrings.
        concepts = _QUOTED_ITEM_RE.findall(array_body)
    else:
        concepts = [entry for entry in parsed if isinstance(entry, str)]

    concepts = concepts[:TOP_K]
    return concepts, [None] * len(concepts)


def _build_question(item):
    """Convert one dataset row into the question dict used by this script."""
    first_label = ord("A")
    options = {
        chr(first_label + i): choice
        for i, choice in enumerate(item['choices'])
    }
    return {
        "question": item['question'],
        "options": options,
        "answer": chr(first_label + item['answer']),
    }


def main():
    """Run every configured model over every configured MMLU subset."""
    # Loop-invariant, so it is built once instead of once per question.
    sampling_params = SamplingParams(
        temperature=0.2,
        top_p=0.9,
        max_tokens=256,
        stop=["User:", "<|endoftext|>"],
    )

    for model_name in MODEL_NAMES:
        print(f"Processing model: {model_name}")
        llm = LLM(model=model_name, tensor_parallel_size=2)

        for subset in DATASETS:
            print(f"Processing {subset}...")
            ds = load_dataset("cais/mmlu", subset, split="test")
            save_path = os.path.join(
                SAVE_DIR, model_name.replace("/", "__"), subset
            )
            ensure_dir(save_path)

            questions = [_build_question(item) for item in ds]
            prompts = [format_prompt(qdict) for qdict in questions]

            # One batched call per subset; results come back in prompt order.
            # NOTE(review): prompts are passed as raw text and no chat
            # template is applied - confirm this is intended for chat models.
            results = llm.generate(prompts, sampling_params)

            for idx, (qdict, result) in enumerate(zip(questions, results)):
                model_output = result.outputs[0].text.strip()
                topk_tokens, uncertainties = parse_model_output(model_output)
                out = {
                    "question": qdict["question"],
                    "options": qdict["options"],
                    "answer": qdict["answer"],
                    "topk_tokens": topk_tokens,
                    "uncertainties": uncertainties,
                    "raw_output": model_output,
                }
                fname = os.path.join(save_path, f"question_{idx:04d}.json")
                with open(fname, 'w', encoding='utf-8') as f:
                    json.dump(out, f, ensure_ascii=False, indent=2)
                if idx % 20 == 0:
                    print(f"Saved {fname}")


if __name__ == "__main__":
    main()