File size: 3,347 Bytes
a80f6e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
import json
from datasets import load_dataset
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
import re

# Model identifiers to evaluate; main() passes each one to vLLM's LLM and to
# AutoTokenizer.from_pretrained.
MODEL_NAMES = [
    "Qwen/Qwen2.5-7B-Instruct",
    # "meta-llama/Llama-2-7b-chat-hf",
    # other model names...
]

# Subset names that main() passes to load_dataset("cais/mmlu", <subset>, split="test").
DATASETS = ["college_computer_science"]  # ,"abstract_algebra", "college_mathematics", "conceptual_physics"]

# Root output directory; main() writes one JSON file per question under
# SAVE_DIR/<model name with "/" replaced by "__">/<subset>/.
SAVE_DIR = "data/model_method"

# Instruction text that format_prompt() appends after the question and its
# options. It asks for exactly three concepts in a fixed JSON shape.
PROMPT_SUFFIX = (
    "Please identify the key conceptual topics or knowledge components required to solve this question. "
    "List 3 such concepts that are most essential for understanding and solving the problem. "
    "You don't need to explain the concepts or solve the problem, just list them. Output your answer in the following JSON format:\n"
    '{"concepts": ["concept1", "concept2", "concept3"]}'
)

def ensure_dir(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent and removes the race between a
    # separate existence check and the directory creation.
    os.makedirs(path, exist_ok=True)

def format_prompt(q):
    """Build the full prompt text for one multiple-choice question.

    Args:
        q: Mapping with a ``'question'`` string and an ``'options'`` mapping
            from option label to option text.

    Returns:
        The question on its own line, then one ``"<label>. <text>"`` line per
        option in sorted label order, followed by ``PROMPT_SUFFIX``.
    """
    stem = q['question']
    choices = q['options']
    pieces = [f"{stem}\n"]
    pieces.extend(f"{label}. {choices[label]}\n" for label in sorted(choices.keys()))
    pieces.append(PROMPT_SUFFIX)
    return "".join(pieces)

def parse_model_output(output, top_k=3):
    """Extract the list of concepts from a model's JSON-style answer.

    Finds the first ``"concepts": [...]`` array in ``output`` and returns its
    string entries.

    Args:
        output: Raw text generated by the model.
        top_k: Maximum number of concepts to keep; ``None`` keeps all of them.
            Defaults to 3, the number of concepts the prompt asks for.

    Returns:
        A ``(concepts, uncertainties)`` tuple. ``concepts`` is a list of
        strings and ``uncertainties`` is a list of ``None`` placeholders of
        the same length. Both lists are empty when no array is found.
    """
    # Locate the contents of the "concepts" array; DOTALL lets the array span
    # several lines.
    match = re.search(r'"concepts"\s*:\s*\[(.*?)\]', output, re.DOTALL)
    if match is None:
        return [], []

    array_body = match.group(1)
    try:
        # Decode as JSON first so escaped quotes or unicode escapes inside a
        # concept are handled correctly.
        decoded = json.loads(f"[{array_body}]")
    except ValueError:
        # Malformed array (e.g. missing commas): fall back to collecting the
        # double-quoted substrings.
        concepts = re.findall(r'"(.*?)"', array_body)
    else:
        concepts = [entry for entry in decoded if isinstance(entry, str)]

    concepts = concepts[:top_k]
    return concepts, [None] * len(concepts)

def _to_question(item):
    """Convert one dataset row into the question dict used by this script."""
    # Choices are labelled A, B, C, ... and the integer answer index is mapped
    # to the same letter scheme.
    options = {chr(65 + i): choice for i, choice in enumerate(item['choices'])}
    return {
        "question": item['question'],
        "options": options,
        "answer": chr(65 + item['answer']),
    }


def main():
    """Generate and save key concepts for every question of every subset.

    For each model in ``MODEL_NAMES`` and each subset in ``DATASETS``, loads
    the ``test`` split of ``cais/mmlu``, prompts the model for the concepts
    needed to solve each question, and writes one ``question_XXXX.json`` file
    per question under ``SAVE_DIR/<model>/<subset>/``.
    """
    # The sampling configuration is identical for every request, so build it
    # once instead of once per question.
    sampling_params = SamplingParams(
        temperature=0.2,
        top_p=0.9,
        max_tokens=256,
        stop=["User:", "<|endoftext|>"],
    )
    for model_name in MODEL_NAMES:
        print(f"Processing model: {model_name}")
        llm = LLM(model=model_name, tensor_parallel_size=2)
        for subset in DATASETS:
            print(f"Processing {subset}...")
            ds = load_dataset("cais/mmlu", subset, split="test")
            # "/" in the model name would otherwise create nested directories.
            save_path = os.path.join(SAVE_DIR, model_name.replace("/", "__"), subset)
            ensure_dir(save_path)

            questions = [_to_question(item) for item in ds]
            if not questions:
                continue
            prompts = [format_prompt(qdict) for qdict in questions]
            # Submit the whole subset in one call so vLLM can batch the
            # requests; results come back in the same order as the prompts.
            results = llm.generate(prompts, sampling_params)

            for idx, (qdict, result) in enumerate(zip(questions, results)):
                model_output = result.outputs[0].text.strip()
                topk_tokens, uncertainties = parse_model_output(model_output)
                out = {
                    "question": qdict["question"],
                    "options": qdict["options"],
                    "answer": qdict["answer"],
                    "topk_tokens": topk_tokens,
                    "uncertainties": uncertainties,
                    "raw_output": model_output
                }
                fname = os.path.join(save_path, f"question_{idx:04d}.json")
                with open(fname, 'w', encoding='utf-8') as f:
                    json.dump(out, f, ensure_ascii=False, indent=2)
                # Report progress only occasionally to keep the log short.
                if idx % 20 == 0:
                    print(f"Saved {fname}")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()