unfair11212
/

learn

Model card Files Files and versions

learn / test_time_scaling /process_prob_method.py

unfair11212's picture

Upload folder using huggingface_hub

a80f6e6 verified 10 months ago

history blame contribute delete

4.02 kB

	import os
	import json
	from datasets import load_dataset
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import torch
	import numpy as np

	# Hugging Face model id loaded by main() for both the tokenizer and the model.
	MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
	# Subset names passed to load_dataset("cais/mmlu", ...); each gets its own
	# output sub-directory under SAVE_DIR.
	DATASETS = ["abstract_algebra", "college_mathematics"]
	# Number of lowest-probability tokens recorded per question.
	TOP_K = 5
	# Root directory for the per-question JSON files.
	SAVE_DIR = "data/prob_method"

	# Lowercase function words that are excluded from the token ranking in main().
	# Tokens are lowercased before the membership test, so entries must stay
	# lowercase. Each word appears exactly once.
	STOPWORDS = {
	    "that", "of", "the", "and", "to", "in", "for", "with", "on", "at", "by", "from", "as", "is", "are", "was", "were",
	    "be", "been", "being", "a", "an", "but", "or", "if", "because", "so", "do", "does", "did", "not", "no", "yes",
	    "can", "could", "should", "would", "will", "shall", "may", "might", "must", "this", "these", "those",
	    "it", "its", "he", "she", "they", "them", "his", "her", "their", "you", "your", "we", "our", "i", "me", "my",
	    "mine", "who", "whom", "which", "what", "when", "where", "why", "how", "also", "than", "then", "there", "here",
	    "such", "other", "some", "any", "all", "each", "every", "either", "neither", "both", "few", "more", "most", "much", "many",
	}

	def ensure_dir(path):
	    """Create directory *path*, including missing parents, if it is absent.

	    Calling this on an existing directory is a no-op. ``exist_ok=True`` is
	    used instead of a separate existence check so that a directory created
	    by another process between the check and the creation does not raise.

	    Raises:
	        FileExistsError: if *path* exists but is not a directory.
	    """
	    os.makedirs(path, exist_ok=True)

	def get_per_token_prob(model, tokenizer, prompt):
	    """Return the tokens of *prompt* and the model's probability for each.

	    The prompt is tokenized and run through the model once. For every token
	    after the first, the probability the model assigned to that token given
	    the preceding tokens is read from the softmax of the logits.

	    Args:
	        model: callable returning an object with a ``logits`` attribute when
	            given the input ids; must expose ``device``.
	        tokenizer: callable returning an object with ``input_ids`` when
	            invoked with ``return_tensors="pt"``; must provide
	            ``convert_ids_to_tokens``.
	        prompt: text to score. Assumed to be a single string, so that the
	            batch dimension has size one (it is squeezed away below).

	    Returns:
	        A ``(tokens, probs)`` pair of equal length: the token strings with
	        any leading 'Ġ' characters removed, and a NumPy array holding the
	        probability of each of those tokens. The first token of the
	        sequence is not included because nothing precedes it, so the model
	        produces no prediction for it.
	    """
	    inputs = tokenizer(prompt, return_tensors="pt")
	    input_ids = inputs.input_ids.to(model.device)
	    with torch.no_grad():
	        logits = model(input_ids).logits

	    # Logits at position t score the token at position t + 1, so the last
	    # position has no target and is dropped before the softmax. Upcasting to
	    # float32 keeps the NumPy conversion below working for half-precision
	    # models (NumPy has no bfloat16 dtype).
	    shift_logits = logits[:, :-1, :].float()
	    shift_probs = torch.softmax(shift_logits, dim=-1)
	    shift_labels = input_ids[:, 1:]

	    # Pick out the probability of each token that actually occurred.
	    token_probs = shift_probs.gather(-1, shift_labels.unsqueeze(-1)).squeeze(-1)
	    token_probs = token_probs.squeeze(0).cpu().numpy()

	    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0).tolist())
	    # Skip the first token (it has no probability) and strip the 'Ġ' prefix,
	    # presumably the tokenizer's marker for a leading space.
	    clean_tokens = [token.lstrip('Ġ') for token in tokens[1:]]
	    return clean_tokens, token_probs

	def format_prompt(q):
	    """Return the text to score for question record *q*.

	    Only the value under the ``'question'`` key is used; any other keys in
	    the record (such as options or the answer) are left out of the prompt.

	    Raises:
	        KeyError: if *q* has no ``'question'`` key.
	    """
	    return q['question']

	def _select_lowest_prob_tokens(tokens, probs, k):
	    """Return up to *k* ``(index, token, prob)`` triples with the lowest prob.

	    Tokens of length one and tokens whose lowercase form is in STOPWORDS are
	    ignored. The result is ordered from lowest to highest probability; ties
	    keep their original order because the sort is stable.
	    """
	    candidates = [
	        (i, tok, prob)
	        for i, (tok, prob) in enumerate(zip(tokens, probs))
	        if len(tok) > 1 and tok.lower() not in STOPWORDS
	    ]
	    # Slicing already copes with fewer than k candidates.
	    return sorted(candidates, key=lambda c: c[2])[:k]


	def main():
	    """Score every test question of each subset and save the results.

	    Loads the tokenizer and model named by MODEL_NAME, then for each subset
	    in DATASETS iterates over the "test" split of "cais/mmlu". For every
	    question the per-token probabilities of the question text are computed,
	    the TOP_K lowest-probability tokens (after filtering) are selected, and
	    a JSON file ``question_<idx>.json`` is written under
	    ``SAVE_DIR/<subset>``.
	    """
	    device = 'cuda' if torch.cuda.is_available() else 'cpu'
	    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
	    model.eval()

	    for subset in DATASETS:
	        print(f"Processing {subset}...")
	        ds = load_dataset("cais/mmlu", subset, split="test")
	        save_path = os.path.join(SAVE_DIR, subset)
	        ensure_dir(save_path)

	        for idx, item in enumerate(ds):
	            # Label the choices 'A', 'B', ... (65 is ord('A')); the answer is
	            # stored as an index into the choices and mapped the same way.
	            options = {chr(65 + i): choice for i, choice in enumerate(item['choices'])}
	            answer = chr(65 + item['answer'])
	            qdict = {"question": item['question'], "options": options, "answer": answer}

	            prompt = format_prompt(qdict)
	            tokens, per_token_probs = get_per_token_prob(model, tokenizer, prompt)

	            # Drop single-character tokens and stopwords, then keep the
	            # TOP_K tokens the model found least likely.
	            topk = _select_lowest_prob_tokens(tokens, per_token_probs, TOP_K)
	            topk_tokens = [tok for _, tok, _ in topk]
	            # Convert to built-in floats so the values are JSON serializable.
	            uncertainties = [float(prob) for _, _, prob in topk]

	            out = {
	                "question": qdict["question"],
	                "options": qdict["options"],
	                "answer": qdict["answer"],
	                "topk_tokens": topk_tokens,
	                "uncertainties": uncertainties
	            }
	            fname = os.path.join(save_path, f"question_{idx:04d}.json")
	            with open(fname, 'w', encoding='utf-8') as f:
	                json.dump(out, f, ensure_ascii=False, indent=2)
	            # Progress message for every 20th question.
	            if idx % 20 == 0:
	                print(f"Saved {fname}")

	# Run the pipeline only when the file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()