"""Sanity check: score a sentence with a masked LM without masking anything.

The labels passed to the model are an exact copy of the input ids, so no
position is hidden from the model. The script prints the resulting loss and
exp(loss), and emits a warning when that value is suspiciously close to 1.
"""
import math

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

MODEL_NAME = "microsoft/deberta-v3-base"

print(f"Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
model.eval()  # inference mode; no training happens in this script

text = "This is a simple sentence written by a human."
inputs = tokenizer(text, return_tensors="pt")

# Labels are the unmodified input ids: nothing is masked on purpose, which is
# exactly the situation this script is meant to expose.
labels = inputs["input_ids"].clone()

with torch.no_grad():
    outputs = model(inputs["input_ids"], labels=labels)
    loss = outputs.loss
    # Perplexity is reported as exp(loss).
    ppl = loss.exp()

print(f"Text: {text}")
print(f"Loss: {loss.item()}")
print(f"Perplexity: {ppl.item()}")

# A value this close to 1 is treated as a sign that the score is degenerate.
if ppl.item() < 1.5:
    for message in (
        "\nWARNING: Perplexity is extremely low (~1.0).",
        "This indicates the model is predicting tokens it can already see (Identity/Auto-encoder behavior).",
        "For proper PPL/Pseudo-PPL, we must mask the input tokens.",
    ):
        print(message)