File size: 4,016 Bytes

a80f6e6

import os
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np

MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
DATASETS = ["abstract_algebra", "college_mathematics"]
TOP_K = 5
SAVE_DIR = "data/prob_method"

STOPWORDS = {
    "that", "of", "the", "and", "to", "in", "for", "with", "on", "at", "by", "from", "as", "is", "are", "was", "were",
    "be", "been", "being", "a", "an", "but", "or", "if", "because", "so", "do", "does", "did", "not", "no", "yes",
    "can", "could", "should", "would", "will", "shall", "may", "might", "must", "this", "that", "these", "those",
    "it", "its", "he", "she", "they", "them", "his", "her", "their", "you", "your", "we", "our", "i", "me", "my",
    "mine", "who", "whom", "which", "what", "when", "where", "why", "how", "also", "than", "then", "there", "here",
    "such", "other", "some", "any", "all", "each", "every", "either", "neither", "both", "few", "more", "most", "much", "many"
}

def ensure_dir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def get_per_token_prob(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(model.device)
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits
        # 对每个位置，softmax得到概率
        probs = torch.softmax(logits, dim=-1)
        # 取每个位置真实token的概率
        shift_probs = probs[:, :-1, :].contiguous()
        shift_labels = input_ids[:, 1:].contiguous()
        # gather出每个真实token的概率
        token_probs = shift_probs.gather(-1, shift_labels.unsqueeze(-1)).squeeze(-1)
        token_probs = token_probs.squeeze(0).cpu().numpy()
    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0).cpu())
    clean_tokens = [token.lstrip('Ġ') for token in tokens[1:]]
    return clean_tokens, token_probs  # skip the first token (BOS)

def format_prompt(q):
    # Only use the question itself as prompt
    return q['question']

def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
    model.eval()
    for subset in DATASETS:
        print(f"Processing {subset}...")
        ds = load_dataset("cais/mmlu", subset, split="test")
        save_path = os.path.join(SAVE_DIR, subset)
        ensure_dir(save_path)
        for idx, item in enumerate(ds):
            options = {chr(65+i): item['choices'][i] for i in range(len(item['choices']))}
            answer = chr(65 + item['answer'])
            qdict = {"question": item['question'], "options": options, "answer": answer}
            prompt = format_prompt(qdict)
            tokens, per_token_ppl = get_per_token_prob(model, tokenizer, prompt)
            # 过滤掉长度为1和stopwords的token
            token_info = [
                (i, tokens[i], per_token_ppl[i])
                for i in range(len(tokens))
                if len(tokens[i]) > 1 and tokens[i].lower() not in STOPWORDS
            ]
            if len(token_info) < TOP_K:
                topk = sorted(token_info, key=lambda x: x[2])[:TOP_K]
            else:
                topk = sorted(token_info, key=lambda x: x[2])[:TOP_K]
            topk_tokens = [x[1] for x in topk]
            uncertainties = [float(x[2]) for x in topk]
            out = {
                "question": qdict["question"],
                "options": qdict["options"],
                "answer": qdict["answer"],
                "topk_tokens": topk_tokens,
                "uncertainties": uncertainties
            }
            fname = os.path.join(save_path, f"question_{idx:04d}.json")
            with open(fname, 'w', encoding='utf-8') as f:
                json.dump(out, f, ensure_ascii=False, indent=2)
            if idx % 20 == 0:
                print(f"Saved {fname}")

if __name__ == "__main__":
    main()