| import os |
| import json |
| from datasets import load_dataset |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
| import torch |
| import numpy as np |
|
|
# Hugging Face model identifier loaded by ``main``.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
# Subset (config) names of the ``cais/mmlu`` dataset to process.
DATASETS = ["abstract_algebra", "college_mathematics"]
# Number of lowest-probability tokens kept per question.
TOP_K = 5
# Root output directory; one sub-directory is created per subset.
SAVE_DIR = "data/prob_method"


# Lower-cased function words that are never reported as low-probability
# tokens. Each word is listed exactly once.
STOPWORDS = {
    "that", "of", "the", "and", "to", "in", "for", "with", "on", "at", "by",
    "from", "as", "is", "are", "was", "were",
    "be", "been", "being", "a", "an", "but", "or", "if", "because", "so",
    "do", "does", "did", "not", "no", "yes",
    "can", "could", "should", "would", "will", "shall", "may", "might",
    "must", "this", "these", "those",
    "it", "its", "he", "she", "they", "them", "his", "her", "their", "you",
    "your", "we", "our", "i", "me", "my",
    "mine", "who", "whom", "which", "what", "when", "where", "why", "how",
    "also", "than", "then", "there", "here",
    "such", "other", "some", "any", "all", "each", "every", "either",
    "neither", "both", "few", "more", "most", "much", "many",
}
|
|
def ensure_dir(path):
    """Create directory ``path`` (and any missing parents) if it is absent.

    Calling this on an existing directory is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # ``exist_ok=True`` avoids the check-then-create race of testing
    # ``os.path.exists`` first: another process may create the directory
    # between the check and the ``makedirs`` call.
    os.makedirs(path, exist_ok=True)
|
|
def get_per_token_prob(model, tokenizer, prompt):
    """Return the prompt's tokens and the probability the model gave each one.

    The prompt is tokenized and run through ``model`` once. For every token
    except the first (which has no preceding context to be predicted from),
    the probability assigned to that token given the tokens before it is
    read from the softmax over the vocabulary.

    Args:
        model: Causal language model; called as ``model(input_ids)`` and
            expected to expose ``.device`` and return an object with
            ``.logits``.
        tokenizer: Tokenizer; called as ``tokenizer(prompt,
            return_tensors="pt")`` and expected to provide
            ``convert_ids_to_tokens``.
        prompt: Text to score.

    Returns:
        A ``(tokens, probs)`` pair: ``tokens`` is the list of token strings
        for positions 1..T-1 with any leading ``Ġ`` characters removed, and
        ``probs`` is a float32 NumPy array of the same length.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(model.device)
    with torch.no_grad():
        logits = model(input_ids).logits

    # Logits at position t score the token at position t + 1, so drop the
    # last position of the logits and the first position of the labels.
    # Upcast to float32: half-precision softmax loses accuracy, and NumPy
    # cannot represent bfloat16, so ``.numpy()`` would raise on such models.
    shift_logits = logits[:, :-1, :].float()
    shift_labels = input_ids[:, 1:]

    probs = torch.softmax(shift_logits, dim=-1)
    token_probs = probs.gather(-1, shift_labels.unsqueeze(-1)).squeeze(-1)
    token_probs = token_probs.squeeze(0).cpu().numpy()

    # Pass plain Python ints rather than a tensor to the tokenizer.
    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0).tolist())
    # ``Ġ`` is the leading-space marker in byte-level BPE vocabularies.
    clean_tokens = [token.lstrip("Ġ") for token in tokens[1:]]
    return clean_tokens, token_probs
|
|
def format_prompt(q):
    """Build the prompt text for one question record.

    Only the ``"question"`` field of ``q`` is used; any other keys are
    ignored.

    Args:
        q: Mapping that contains a ``"question"`` entry.

    Returns:
        The value stored under ``"question"``, unchanged.

    Raises:
        KeyError: If ``q`` has no ``"question"`` key.
    """
    question_text = q["question"]
    return question_text
|
|
def _select_low_prob_tokens(tokens, probs, top_k):
    """Return up to ``top_k`` ``(token, prob)`` pairs with the lowest prob.

    Tokens of length <= 1 and tokens whose lower-cased form is in
    ``STOPWORDS`` are skipped. The sort is stable, so tokens with equal
    probability keep their order of appearance in the prompt.
    """
    candidates = [
        (token, prob)
        for token, prob in zip(tokens, probs)
        if len(token) > 1 and token.lower() not in STOPWORDS
    ]
    candidates.sort(key=lambda pair: pair[1])
    # Slicing handles the "fewer than top_k candidates" case by itself.
    return candidates[:top_k]


def main():
    """Score every test question of each MMLU subset and save the results.

    For each subset in ``DATASETS`` the ``cais/mmlu`` test split is loaded,
    each question is run through the model, and the ``TOP_K`` content tokens
    with the lowest model probability are written to
    ``SAVE_DIR/<subset>/question_<idx>.json`` along with the question, its
    lettered options and the answer letter.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
    model.eval()

    for subset in DATASETS:
        print(f"Processing {subset}...")
        ds = load_dataset("cais/mmlu", subset, split="test")
        save_path = os.path.join(SAVE_DIR, subset)
        ensure_dir(save_path)

        for idx, item in enumerate(ds):
            # Label the choices "A", "B", ... (65 is ord("A")).
            options = {
                chr(65 + i): choice
                for i, choice in enumerate(item["choices"])
            }
            answer = chr(65 + item["answer"])
            qdict = {
                "question": item["question"],
                "options": options,
                "answer": answer,
            }
            prompt = format_prompt(qdict)
            tokens, per_token_probs = get_per_token_prob(
                model, tokenizer, prompt
            )

            topk = _select_low_prob_tokens(tokens, per_token_probs, TOP_K)
            out = {
                "question": qdict["question"],
                "options": qdict["options"],
                "answer": qdict["answer"],
                "topk_tokens": [token for token, _ in topk],
                # Despite the key name, these are the raw token
                # probabilities (lowest first); float() makes the NumPy
                # scalars JSON-serializable.
                "uncertainties": [float(prob) for _, prob in topk],
            }

            fname = os.path.join(save_path, f"question_{idx:04d}.json")
            with open(fname, "w", encoding="utf-8") as f:
                json.dump(out, f, ensure_ascii=False, indent=2)
            if idx % 20 == 0:
                print(f"Saved {fname}")


if __name__ == "__main__":
    main()