# learn/test_time_scaling/process_prob_method.py
# unfair11212's picture
# Upload folder using huggingface_hub
# a80f6e6 verified
import os
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
# Hugging Face model id of the causal LM whose token probabilities are read.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
# Subset names requested from the "cais/mmlu" dataset.
DATASETS = ["abstract_algebra", "college_mathematics"]
# Number of lowest-probability tokens kept per question.
TOP_K = 5
# Root output directory; one sub-directory is created per subset.
SAVE_DIR = "data/prob_method"

# Lower-cased function words that are never reported as low-probability tokens.
STOPWORDS = {
    # articles and demonstratives
    "a", "an", "the", "this", "that", "these", "those",
    # prepositions
    "of", "to", "in", "for", "with", "on", "at", "by", "from", "as",
    # conjunctions and connectives
    "and", "but", "or", "if", "because", "so", "than", "then",
    # forms of "be" and "do"
    "is", "are", "was", "were", "be", "been", "being", "do", "does", "did",
    # modal verbs
    "can", "could", "should", "would", "will", "shall", "may", "might", "must",
    # negation / affirmation
    "not", "no", "yes",
    # personal and possessive pronouns
    "it", "its", "he", "she", "they", "them", "his", "her", "their",
    "you", "your", "we", "our", "i", "me", "my", "mine",
    # interrogatives / relatives
    "who", "whom", "which", "what", "when", "where", "why", "how",
    # common adverbs
    "also", "there", "here",
    # quantifiers and determiners
    "such", "other", "some", "any", "all", "each", "every", "either",
    "neither", "both", "few", "more", "most", "much", "many",
}
def ensure_dir(path):
    """Create directory ``path``, including missing parents, if it is absent.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True avoids the race between a separate existence check and
    # the creation call when another process creates the directory in between.
    os.makedirs(path, exist_ok=True)
def get_per_token_prob(model, tokenizer, prompt):
    """Return the probability the model assigns to each token of ``prompt``.

    The prompt is tokenized and passed through ``model`` once. For every
    token except the first, the probability of that token given the tokens
    before it is read from the softmax over the preceding position's logits.
    The first token has no preceding context and is therefore not scored.

    Args:
        model: Callable such that ``model(input_ids)`` returns an object with
            a ``logits`` tensor; ``model.device`` is where inputs are moved.
        tokenizer: Callable such that ``tokenizer(prompt, return_tensors="pt")``
            returns an object with ``input_ids``; must also provide
            ``convert_ids_to_tokens``.
        prompt: Text to score.

    Returns:
        A ``(tokens, probs)`` pair of equal length. ``tokens`` holds the token
        strings for positions 1..end with any leading ``'Ġ'`` marker removed;
        ``probs`` is a NumPy array with the matching probabilities.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(model.device)
    with torch.no_grad():
        logits = model(input_ids).logits

    # Logits at position t predict token t + 1: drop the last position (it
    # predicts a token that is not in the prompt) and pair the remainder with
    # the ids shifted left by one.
    # Casting to float32 keeps the softmax accurate for half-precision models
    # and allows the NumPy conversion below, which rejects bfloat16 tensors.
    shift_logits = logits[:, :-1, :].float()
    shift_labels = input_ids[:, 1:]

    # Softmax over the vocabulary axis, then pick the probability of the
    # token that actually occurs at each position.
    shift_probs = torch.softmax(shift_logits, dim=-1)
    token_probs = shift_probs.gather(-1, shift_labels.unsqueeze(-1)).squeeze(-1)
    # squeeze(0) drops the batch axis; assumes a single prompt per call.
    token_probs = token_probs.squeeze(0).cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0).cpu().tolist())
    # Skip the first token (it has no probability) and strip the 'Ġ' prefix
    # from the start of each remaining token string.
    clean_tokens = [token.lstrip('Ġ') for token in tokens[1:]]
    return clean_tokens, token_probs
def format_prompt(q):
    """Return the prompt text for one question record.

    Only the ``'question'`` entry of ``q`` is used; the answer options and
    the answer label are not part of the prompt.
    """
    question_text = q['question']
    return question_text
def _select_least_probable(tokens, probs, k):
    """Return up to ``k`` ``(index, token, prob)`` triples with the lowest prob.

    Tokens of length 1 and tokens whose lower-cased form is in ``STOPWORDS``
    are excluded before ranking.
    """
    candidates = [
        (i, tok, prob)
        for i, (tok, prob) in enumerate(zip(tokens, probs))
        if len(tok) > 1 and tok.lower() not in STOPWORDS
    ]
    # sorted() is stable, so ties keep prompt order; the slice simply yields
    # fewer items when there are fewer than k candidates.
    return sorted(candidates, key=lambda c: c[2])[:k]


def main():
    """Score every test question of each configured subset and save the results.

    Loads the tokenizer and model named by ``MODEL_NAME``, then for each
    subset in ``DATASETS`` loads the ``test`` split of ``cais/mmlu``. For each
    question, the question text is scored token by token and the ``TOP_K``
    lowest-probability tokens (after stopword / length filtering) are written
    to ``SAVE_DIR/<subset>/question_<idx>.json`` together with the question,
    its lettered options and the answer letter.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
    model.eval()

    for subset in DATASETS:
        print(f"Processing {subset}...")
        ds = load_dataset("cais/mmlu", subset, split="test")
        save_path = os.path.join(SAVE_DIR, subset)
        ensure_dir(save_path)

        for idx, item in enumerate(ds):
            # Label the choices 'A', 'B', ... and map the answer index to the
            # same lettering.
            options = {
                chr(65 + i): choice for i, choice in enumerate(item['choices'])
            }
            answer = chr(65 + item['answer'])
            qdict = {"question": item['question'], "options": options, "answer": answer}

            prompt = format_prompt(qdict)
            tokens, token_probs = get_per_token_prob(model, tokenizer, prompt)

            topk = _select_least_probable(tokens, token_probs, TOP_K)
            topk_tokens = [entry[1] for entry in topk]
            # NOTE: the values stored under "uncertainties" are the raw token
            # probabilities (lower means the model was less certain).
            uncertainties = [float(entry[2]) for entry in topk]

            out = {
                "question": qdict["question"],
                "options": qdict["options"],
                "answer": qdict["answer"],
                "topk_tokens": topk_tokens,
                "uncertainties": uncertainties
            }
            fname = os.path.join(save_path, f"question_{idx:04d}.json")
            with open(fname, 'w', encoding='utf-8') as f:
                json.dump(out, f, ensure_ascii=False, indent=2)
            # Progress message for every 20th question.
            if idx % 20 == 0:
                print(f"Saved {fname}")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()