# Source path: learn/test_time_scaling/process_model_method.py
# Hosting-page metadata: uploaded by unfair11212, commit a80f6e6 (verified),
# commit message "Upload folder using huggingface_hub".
import os
import json
from datasets import load_dataset
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
import re
# Model identifiers handed to vLLM, one full pass per entry.
# Append further names to evaluate more models,
# e.g. "meta-llama/Llama-2-7b-chat-hf".
MODEL_NAMES = ["Qwen/Qwen2.5-7B-Instruct"]

# Subsets of the "cais/mmlu" dataset to process. Other candidates that have
# been used: "abstract_algebra", "college_mathematics", "conceptual_physics".
DATASETS = ["college_computer_science"]

# Root directory under which per-model / per-subset results are written.
SAVE_DIR = "data/model_method"

# Instruction appended after the question and its options. It asks the model
# for exactly three concepts in a small JSON object so the answer can be
# extracted mechanically.
PROMPT_SUFFIX = (
    " ".join(
        [
            "Please identify the key conceptual topics or knowledge components required to solve this question.",
            "List 3 such concepts that are most essential for understanding and solving the problem.",
            "You don't need to explain the concepts or solve the problem, just list them. Output your answer in the following JSON format:",
        ]
    )
    + "\n"
    + '{"concepts": ["concept1", "concept2", "concept3"]}'
)
def ensure_dir(path):
    """Create directory ``path``, including missing parents, if it is absent.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: If the directory cannot be created for another reason.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first and then calling makedirs().
    os.makedirs(path, exist_ok=True)
def format_prompt(q):
    """Render a question dict as the text prompt sent to the model.

    Args:
        q: Mapping with a ``'question'`` entry (the question text) and an
            ``'options'`` entry (mapping from option label to option text).

    Returns:
        The question on its own line, then one ``"<label>. <text>"`` line per
        option in sorted label order, followed by ``PROMPT_SUFFIX``.
    """
    stem = q['question']
    choices = q['options']
    pieces = [f"{stem}\n"]
    pieces.extend(f"{label}. {choices[label]}\n" for label in sorted(choices))
    pieces.append(PROMPT_SUFFIX)
    return "".join(pieces)
def parse_model_output(output, top_k=3):
    """Extract the list of concepts from the model's raw text output.

    The text is searched for the first ``"concepts": [ ... ]`` fragment and
    every double-quoted string inside the brackets is taken as one concept.
    Extraction is regex-based, so it tolerates extra text around the JSON
    object; a concept containing ``]`` or an escaped quote is not handled.

    Args:
        output: Raw text generated by the model.
        top_k: Maximum number of concepts to keep. Defaults to 3, the number
            of concepts requested by the prompt.

    Returns:
        A ``(concepts, uncertainties)`` tuple. ``concepts`` is a list of at
        most ``top_k`` strings; ``uncertainties`` is a list of ``None``
        placeholders of the same length. Both lists are empty when no
        ``"concepts"`` array is found.
    """
    # Locate the bracketed array that follows the "concepts" key.
    match = re.search(r'"concepts"\s*:\s*\[(.*?)\]', output, re.DOTALL)
    if match is None:
        return [], []

    # Pull out each quoted concept and keep only the first top_k of them.
    concepts = re.findall(r'"(.*?)"', match.group(1))[:top_k]
    uncertainties = [None] * len(concepts)
    return concepts, uncertainties
def main():
    """Query every model in ``MODEL_NAMES`` on every subset in ``DATASETS``.

    For each (model, subset) pair the test split of ``cais/mmlu`` is loaded,
    one prompt per question is built with ``format_prompt``, and the model's
    answers are parsed with ``parse_model_output``. Each question is written
    to ``<SAVE_DIR>/<model name with "/" replaced by "__">/<subset>/
    question_<idx>.json`` containing the question, its options, the gold
    answer letter, the parsed concepts, the placeholder uncertainties and the
    raw model output.
    """
    # The sampling configuration is the same for every request; build it once.
    sampling_params = SamplingParams(
        temperature=0.2,
        top_p=0.9,
        max_tokens=256,
        stop=["User:", "<|endoftext|>"],
    )

    for model_name in MODEL_NAMES:
        print(f"Processing model: {model_name}")
        # NOTE(review): a new engine is created per model and the previous one
        # is not explicitly released; with several entries in MODEL_NAMES this
        # may exhaust GPU memory -- confirm before adding more models.
        llm = LLM(model=model_name, tensor_parallel_size=2)

        for subset in DATASETS:
            print(f"Processing {subset}...")
            ds = load_dataset("cais/mmlu", subset, split="test")
            save_path = os.path.join(SAVE_DIR, model_name.replace("/", "__"), subset)
            ensure_dir(save_path)

            # Convert each dataset row to the question dict format_prompt expects:
            # choices are labelled A, B, C, ... and the integer answer index is
            # mapped to the matching letter.
            records = []
            for item in ds:
                options = {chr(65 + i): choice for i, choice in enumerate(item['choices'])}
                records.append({
                    "question": item['question'],
                    "options": options,
                    "answer": chr(65 + item['answer']),
                })

            # Submit the whole subset in one call so vLLM can batch the requests
            # instead of running them one at a time; results come back in the
            # same order as the prompts.
            prompts = [format_prompt(record) for record in records]
            outputs = llm.generate(prompts, sampling_params)

            for idx, (record, result) in enumerate(zip(records, outputs)):
                model_output = result.outputs[0].text.strip()
                topk_tokens, uncertainties = parse_model_output(model_output)
                out = {
                    "question": record["question"],
                    "options": record["options"],
                    "answer": record["answer"],
                    "topk_tokens": topk_tokens,
                    "uncertainties": uncertainties,
                    "raw_output": model_output,
                }
                fname = os.path.join(save_path, f"question_{idx:04d}.json")
                with open(fname, 'w', encoding='utf-8') as f:
                    json.dump(out, f, ensure_ascii=False, indent=2)
                # Report progress on every 20th question only.
                if idx % 20 == 0:
                    print(f"Saved {fname}")


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()