| import os |
| import json |
| from datasets import load_dataset |
| from vllm import LLM, SamplingParams |
| from transformers import AutoTokenizer |
| import re |
|
|
# Model identifiers to evaluate; main() builds one vLLM engine per entry.
MODEL_NAMES = ["Qwen/Qwen2.5-7B-Instruct"]


# Subset names handed to load_dataset("cais/mmlu", <subset>, split="test").
DATASETS = ["college_computer_science"]


# Root of the output tree; results land in <SAVE_DIR>/<model>/<subset>/.
SAVE_DIR = "data/model_method"
|
|
# Instruction text appended after the question and its answer options.
# It asks for exactly three concept names, returned as a small JSON object
# under the "concepts" key.
PROMPT_SUFFIX = (
    "Please identify the key conceptual topics or knowledge components required to solve this question. "
    "List 3 such concepts that are most essential for understanding and solving the problem. "
    "You don't need to explain the concepts or solve the problem, just list them. "
    "Output your answer in the following JSON format:\n"
    '{"concepts": ["concept1", "concept2", "concept3"]}'
)
|
|
def ensure_dir(path):
    """Create directory *path* (and any missing parents) if it is absent.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first, and surfaces a clashing regular file
    # immediately instead of silently returning.
    os.makedirs(path, exist_ok=True)
|
|
def format_prompt(q):
    """Render a question record as the text prompt sent to the model.

    The prompt is the question on its own line, then one "<key>. <text>"
    line per option in sorted key order, then PROMPT_SUFFIX.

    Args:
        q: Mapping with a 'question' entry and an 'options' mapping of
            option key to option text.

    Returns:
        The assembled prompt string.
    """
    stem = q['question']
    choices = q['options']
    pieces = [f"{stem}\n"]
    pieces.extend(f"{label}. {choices[label]}\n" for label in sorted(choices))
    pieces.append(PROMPT_SUFFIX)
    return "".join(pieces)
|
|
def parse_model_output(output, top_k=3):
    """Extract the list of concepts from a model completion.

    Searches *output* for the first ``"concepts": [ ... ]`` fragment and
    returns the string items found inside the brackets.

    Args:
        output: Raw completion text produced by the model.
        top_k: Maximum number of concepts to keep. Defaults to 3, the
            number the prompt asks for. ``None`` keeps every concept.

    Returns:
        A ``(concepts, uncertainties)`` tuple. ``concepts`` is a list of
        strings; ``uncertainties`` is a list of ``None`` of the same length
        (no uncertainty value is computed here). Both lists are empty when
        no ``"concepts"`` array is found.
    """
    found = re.search(r'"concepts"\s*:\s*\[(.*?)\]', output, re.DOTALL)
    if found is None:
        return [], []

    array_body = found.group(1)
    try:
        # Decoding as JSON handles escaped quotes and unicode escapes that a
        # plain quote-to-quote regex would split or leave undecoded.
        decoded = json.loads(f"[{array_body}]")
        concepts = [entry for entry in decoded if isinstance(entry, str)]
    except ValueError:
        # Not strict JSON (e.g. trailing comma): fall back to collecting
        # every double-quoted run inside the brackets.
        concepts = re.findall(r'"(.*?)"', array_body)

    # The limit is a parameter because no module-level TOP_K is defined in
    # this file; relying on that global raised NameError on every match.
    concepts = concepts[:top_k]
    return concepts, [None] * len(concepts)
|
|
def main():
    """Run concept extraction for every model/subset pair and save results.

    For each model in MODEL_NAMES a vLLM engine is created. For each subset
    in DATASETS the "test" split of "cais/mmlu" is loaded, every question is
    turned into a prompt with format_prompt(), all prompts of the subset are
    generated in one batched call, and one JSON file per question is written
    to <SAVE_DIR>/<model name with "/" replaced by "__">/<subset>/.
    """
    # Sampling settings do not vary per question, so build them once.
    sampling_params = SamplingParams(
        temperature=0.2,
        top_p=0.9,
        max_tokens=256,
        stop=["User:", "<|endoftext|>"],
    )

    for model_name in MODEL_NAMES:
        print(f"Processing model: {model_name}")
        llm = LLM(model=model_name, tensor_parallel_size=2)
        # NOTE(review): prompts are sent as plain text, without the model's
        # chat template; confirm this is intended for instruct models.
        for subset in DATASETS:
            print(f"Processing {subset}...")
            ds = load_dataset("cais/mmlu", subset, split="test")
            save_path = os.path.join(SAVE_DIR, model_name.replace("/", "__"), subset)
            ensure_dir(save_path)

            # Build every question record first so the whole subset can be
            # submitted to vLLM as a single batch.
            records = []
            for item in ds:
                options = {
                    chr(65 + pos): choice
                    for pos, choice in enumerate(item['choices'])
                }
                records.append({
                    "question": item['question'],
                    "options": options,
                    # The dataset stores the answer as an index; map it to
                    # the same A/B/C/... letters used for the options.
                    "answer": chr(65 + item['answer']),
                })
            if not records:
                continue

            prompts = [format_prompt(record) for record in records]
            # One batched call; results come back in prompt order.
            results = llm.generate(prompts, sampling_params)

            for idx, (record, result) in enumerate(zip(records, results)):
                model_output = result.outputs[0].text.strip()
                topk_tokens, uncertainties = parse_model_output(model_output)
                out = {
                    "question": record["question"],
                    "options": record["options"],
                    "answer": record["answer"],
                    "topk_tokens": topk_tokens,
                    "uncertainties": uncertainties,
                    "raw_output": model_output,
                }
                fname = os.path.join(save_path, f"question_{idx:04d}.json")
                with open(fname, 'w', encoding='utf-8') as f:
                    json.dump(out, f, ensure_ascii=False, indent=2)
                if idx % 20 == 0:
                    print(f"Saved {fname}")


if __name__ == "__main__":
    main()
|
|