|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch |
|
|
from transformers import AutoTokenizer, AutoModelForMaskedLM |
|
|
import math |
|
|
|
|
|
MODEL_NAME = "roberta-base"

print(f"Loading {MODEL_NAME}...")

# Tokenizer and masked-LM head must come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

# Prefer a GPU when one is visible; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)
model.eval()  # inference only: disables dropout etc.
|
|
|
|
|
def calculate_perplexity(text):
    """Compute the pseudo-perplexity (PPPL) of *text* under the masked LM.

    Each non-special token (positions 1 .. seq_len-2, skipping the BOS/EOS
    markers the tokenizer adds) is replaced by the mask token in turn; the
    model's negative log-likelihood of the true token at that position is
    collected, and the result is exp(mean NLL).  Lower values mean the text
    is more "expected" by the model.

    Args:
        text: raw input string; tokenized with truncation to 512 tokens.

    Returns:
        float: pseudo-perplexity.  Returns 100.0 for degenerate (too-short)
        input and 0.0 when there are no maskable tokens — sentinels kept
        for backward compatibility with existing callers.
    """
    encodings = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    input_ids = encodings.input_ids.to(device)
    seq_len = input_ids.shape[1]

    # Degenerate input: nothing to score.
    if seq_len < 2:
        return 100.0

    nlls = []
    BATCH_SIZE = 8

    # One template row per batch slot; each row will get a different
    # position masked.
    tensor_input_ids = input_ids.repeat(BATCH_SIZE, 1)
    start_idx = 1          # skip BOS
    end_idx = seq_len - 1  # skip EOS
    # reduction='none' keeps per-position losses; ignore_index defaults to
    # -100, so unmasked positions (labeled -100) contribute exactly 0.
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')

    print(f"Processing {seq_len} tokens with Batch Size {BATCH_SIZE}...")

    for i in range(start_idx, end_idx, BATCH_SIZE):
        current_batch_size = min(BATCH_SIZE, end_idx - i)
        batch_input_ids = tensor_input_ids[:current_batch_size].clone()
        # Explicit dtype: CrossEntropyLoss requires integer (long) targets;
        # relying on torch.full's dtype inference is fragile across torch
        # versions.  Allocating directly on `device` avoids a CPU round-trip.
        batch_labels = torch.full(
            batch_input_ids.shape, -100, dtype=torch.long, device=device
        )

        # Vectorized masking: row j masks token position i + j.
        rows = torch.arange(current_batch_size, device=device)
        positions = rows + i
        batch_labels[rows, positions] = batch_input_ids[rows, positions]
        batch_input_ids[rows, positions] = tokenizer.mask_token_id

        with torch.no_grad():
            outputs = model(batch_input_ids)
        predictions = outputs.logits

        # (batch, seq, vocab) -> (batch, vocab, seq), the layout
        # CrossEntropyLoss expects for sequence targets.
        predictions = predictions.permute(0, 2, 1)
        loss = loss_fct(predictions, batch_labels)
        # Only the single masked position per row carries a real label; every
        # other position is -100 and contributes 0, so the row sum is exactly
        # that token's NLL.
        masked_losses = loss.sum(dim=1)
        nlls.append(masked_losses)

    if not nlls:
        return 0.0
    mean_nll = torch.cat(nlls).mean()
    ppl = torch.exp(mean_nll)
    return ppl.item()
|
|
|
|
|
|
|
|
|
|
|
human_text = "The specific nuance of that joke totally flew over my head, causing a bit of an awkward silence at the dinner table that lasted for what felt like an eternity."

ai_text = "Artificial Intelligence is a branch of computer science that involves the development of systems capable of performing tasks deemed intelligent."

# Score both samples with identical reporting; dict insertion order keeps
# the human sample first, matching the original output exactly.
_samples = {
    "Human": human_text,
    "AI": ai_text,
}
_results = {}
for _label, _sample in _samples.items():
    print(f"\n--- Testing {_label} Text ---")
    _results[_label] = calculate_perplexity(_sample)
    print(f"{_label} PPL: {_results[_label]:.2f}")

# Preserve the individually named results for any downstream use.
ppl_human = _results["Human"]
ppl_ai = _results["AI"]
|
|
|