File size: 2,765 Bytes
171eb01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74

# Minimal script to test the NEW logic inside views.py
# We will import the function from views (or copy it to be safe/clean)
# For now, let's copy the logic here to verify it without Django overhead, 
# then we trust views.py works.

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import math

MODEL_NAME = "roberta-base"
print(f"Loading {MODEL_NAME}...")
# A masked-LM head is required: the perplexity routine below scores text by
# masking one token at a time and reading the model's fill-in probability.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
# Prefer GPU when present; everything downstream moves tensors to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode: disables dropout for deterministic scoring

def calculate_perplexity(text):
    """Estimate the pseudo-perplexity of `text` under the masked LM.

    Each interior token (the BOS token at position 0 and the final token are
    skipped) is masked one at a time; the model's negative log-likelihood for
    the masked token is collected and exp(mean NLL) is returned.

    Args:
        text: Input string; tokenized and truncated to 512 tokens.

    Returns:
        float: pseudo-perplexity score. Returns 100.0 for degenerate inputs
        shorter than 2 tokens, and 0.0 when there is no interior token to
        mask (kept identical to the original behavior).
    """
    encodings = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    input_ids = encodings.input_ids.to(device)
    seq_len = input_ids.shape[1]

    # Degenerate input: nothing meaningful to score.
    if seq_len < 2:
        return 100.0

    nlls = []
    BATCH_SIZE = 8  # Small batch for GTX 1650

    # One template row per token masked in a mini-batch.
    tensor_input_ids = input_ids.repeat(BATCH_SIZE, 1)
    start_idx = 1          # skip BOS special token
    end_idx = seq_len - 1  # skip final (EOS) token
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')

    print(f"Processing {seq_len} tokens with Batch Size {BATCH_SIZE}...")

    for i in range(start_idx, end_idx, BATCH_SIZE):
        current_batch_size = min(BATCH_SIZE, end_idx - i)
        batch_input_ids = tensor_input_ids[:current_batch_size].clone()
        # Allocate labels directly on the target device with an explicit long
        # dtype (the original built a CPU tensor and transferred it every
        # iteration).
        batch_labels = torch.full(
            batch_input_ids.shape, -100, dtype=torch.long, device=device
        )

        # Vectorized diagonal masking: row j masks token position i + j.
        # Replaces the original per-token Python loop with one fancy-index
        # assignment; labels are captured before the mask overwrites inputs.
        rows = torch.arange(current_batch_size, device=device)
        positions = rows + i
        batch_labels[rows, positions] = batch_input_ids[rows, positions]
        batch_input_ids[rows, positions] = tokenizer.mask_token_id

        with torch.no_grad():
            outputs = model(batch_input_ids)
            predictions = outputs.logits

        # CrossEntropyLoss expects (N, C, L). ignore_index=-100 (the default)
        # zeroes every unmasked position, so summing over the sequence axis
        # leaves exactly one NLL per row.
        predictions = predictions.permute(0, 2, 1)
        loss = loss_fct(predictions, batch_labels)
        masked_losses = loss.sum(dim=1)
        nlls.append(masked_losses)

    # seq_len == 2 yields an empty loop; preserve the original 0.0 sentinel.
    if not nlls:
        return 0.0
    all_nlls = torch.cat(nlls)
    mean_nll = all_nlls.mean()
    ppl = torch.exp(mean_nll)
    return ppl.item()

# Test Cases
# Complex/Chaotic Human text
human_text = "The specific nuance of that joke totally flew over my head, causing a bit of an awkward silence at the dinner table that lasted for what felt like an eternity."
# Generic/Wikipedia-style AI text
ai_text = "Artificial Intelligence is a branch of computer science that involves the development of systems capable of performing tasks deemed intelligent."

# Score both samples; the idiosyncratic human sentence is expected to show a
# higher pseudo-perplexity than the formulaic AI-style one.
for label, sample in (("Human", human_text), ("AI", ai_text)):
    print(f"\n--- Testing {label} Text ---")
    score = calculate_perplexity(sample)
    print(f"{label} PPL: {score:.2f}")