Authentica / detree /utils /detectors /binoculars_detector.py
MAS-AI-0000's picture
Upload 6 files
3cdaafb verified
raw
history blame
7.47 kB
from typing import Union
import os
import numpy as np
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import transformers
# Per-token cross-entropy: reduction="none" keeps one loss value per position so
# callers can apply their own masking and mean/median aggregation.
ce_loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
# Softmax over the last axis (the axis `entropy` treats as the vocabulary).
softmax_fn = torch.nn.Softmax(dim=-1)
# Side effect at import time: gradient tracking is switched off.
# Nothing in this file calls backward().
torch.set_grad_enabled(False)
# Hugging Face access configuration, read once from the environment at import.
huggingface_config = {
# Only required for private models from Huggingface (e.g. LLaMA models)
"TOKEN": os.environ.get("HF_TOKEN", None)
}
# Decision thresholds used by Binoculars.change_mode / Binoculars.predict:
# a score strictly below the active threshold is labelled AI-generated.
# selected using Falcon-7B and Falcon-7B-Instruct at bfloat16
BINOCULARS_ACCURACY_THRESHOLD = 0.9015310749276843 # optimized for f1-score
BINOCULARS_FPR_THRESHOLD = 0.8536432310785527 # optimized for low-fpr [chosen at 0.01%]
# Device placement: the observer model goes on DEVICE_1 and the performer on DEVICE_2.
# DEVICE_2 falls back to DEVICE_1 unless more than one CUDA device is visible.
DEVICE_1 = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE_2 = "cuda:1" if torch.cuda.device_count() > 1 else DEVICE_1
def assert_tokenizer_consistency(model_id_1, model_id_2):
    """Raise if the two checkpoints do not share an identical vocabulary.

    The tokenizers of both checkpoints are loaded and their token-to-id
    mappings are compared for equality.

    Args:
        model_id_1: Name or path of the first checkpoint.
        model_id_2: Name or path of the second checkpoint.

    Raises:
        ValueError: If the two vocabularies differ.
    """
    # Use the same credentials as the model loads in this file so that the
    # check does not fail on gated/private repositories.
    token = huggingface_config["TOKEN"]

    # get_vocab() is the documented accessor for the full token -> id mapping;
    # a plain `.vocab` attribute is not guaranteed on every tokenizer class.
    vocab_1 = AutoTokenizer.from_pretrained(model_id_1, token=token).get_vocab()
    vocab_2 = AutoTokenizer.from_pretrained(model_id_2, token=token).get_vocab()

    if vocab_1 != vocab_2:
        raise ValueError(f"Tokenizers are not identical for {model_id_1} and {model_id_2}.")
def perplexity(encoding: transformers.BatchEncoding,
               logits: torch.Tensor,
               median: bool = False,
               temperature: float = 1.0):
    """Aggregate the per-token cross-entropy of each sequence under `logits`.

    The value is the mean (or median) next-token cross-entropy over the
    positions kept by the attention mask; it is not exponentiated.

    Args:
        encoding: Tokenized batch providing `input_ids` and `attention_mask`.
        logits: Model logits for the same batch; the code indexes them as
            (..., sequence, vocabulary).
        median: Aggregate with a NaN-ignoring median instead of a masked mean.
        temperature: Divisor applied to the logits before the loss.

    Returns:
        A NumPy array with one value per sequence in the batch.
    """
    # Logits at position t score the token at position t + 1, so drop the last
    # logit step and the first label/mask step to align them.
    next_token_logits = logits[..., :-1, :].contiguous() / temperature
    targets = encoding.input_ids[..., 1:].contiguous()
    mask = encoding.attention_mask[..., 1:].contiguous()

    # CrossEntropyLoss expects the class axis second: (batch, vocab, seq).
    token_ce = ce_loss_fn(next_token_logits.transpose(1, 2), targets)

    if median:
        # Masked-out positions become NaN so that nanmedian skips them.
        with_nans = token_ce.masked_fill(~mask.bool(), float("nan"))
        return np.nanmedian(with_nans.cpu().float().numpy(), 1)

    masked_mean = (token_ce * mask).sum(1) / mask.sum(1)
    return masked_mean.to("cpu").float().numpy()
def entropy(p_logits: torch.Tensor,
            q_logits: torch.Tensor,
            encoding: transformers.BatchEncoding,
            pad_token_id: int,
            median: bool = False,
            sample_p: bool = False,
            temperature: float = 1.0):
    """Aggregate the per-token cross-entropy between two models' predictions.

    For every position, the softmax of `p_logits` is used as the target
    distribution and `q_logits` as the predicted scores. The per-position
    values are then reduced per sequence, skipping positions whose input id
    equals `pad_token_id`.

    Args:
        p_logits: Logits supplying the target distribution.
        q_logits: Logits being scored; assumed to have the same shape as
            `p_logits` (both are flattened with the same vocabulary size).
        encoding: Tokenized batch; only `input_ids` is read.
        pad_token_id: Token id treated as padding and excluded.
        median: Aggregate with a NaN-ignoring median instead of a masked mean.
        sample_p: Draw one token id per position from the target distribution
            and use it as a hard label instead of the soft distribution.
        temperature: Divisor applied to both sets of logits.

    Returns:
        A NumPy array with one value per sequence in the batch.
    """
    n_vocab = p_logits.shape[-1]
    seq_len = q_logits.shape[-2]

    p_scaled, q_scaled = p_logits / temperature, q_logits / temperature

    # Flatten to (batch * seq, vocab) so the loss sees one row per position.
    targets = softmax_fn(p_scaled).view(-1, n_vocab)
    if sample_p:
        targets = torch.multinomial(targets.view(-1, n_vocab), replacement=True, num_samples=1).view(-1)
    flat_q = q_scaled.view(-1, n_vocab)

    token_ce = ce_loss_fn(input=flat_q, target=targets).view(-1, seq_len)
    keep = (encoding.input_ids != pad_token_id).type(torch.uint8)

    if median:
        # Padding positions become NaN so that nanmedian skips them.
        with_nans = token_ce.masked_fill(~keep.bool(), float("nan"))
        return np.nanmedian(with_nans.cpu().float().numpy(), 1)

    masked_mean = (token_ce * keep).sum(1) / keep.sum(1)
    return masked_mean.to("cpu").float().numpy()
class Binoculars:
    """Two-model detector that scores text and labels it AI- or human-generated.

    An "observer" and a "performer" causal language model are run on the same
    tokenized text. The score is the ratio of `perplexity` (performer logits)
    to `entropy` (observer vs. performer logits); scores below the active
    threshold are labelled as AI-generated.
    """

    def __init__(self,
                 observer_name_or_path: str = "tiiuae/falcon-7b",
                 performer_name_or_path: str = "tiiuae/falcon-7b-instruct",
                 use_bfloat16: bool = True,
                 max_token_observed: int = 512,
                 mode: str = "low-fpr",
                 ) -> None:
        """Load both models and the shared tokenizer.

        Args:
            observer_name_or_path: Checkpoint for the observer model.
            performer_name_or_path: Checkpoint for the performer model.
            use_bfloat16: Load weights as bfloat16 when True, else float32.
            max_token_observed: Truncation length passed to the tokenizer.
            mode: "low-fpr" or "accuracy"; selects the decision threshold.

        Raises:
            ValueError: If the tokenizers differ or `mode` is invalid.
        """
        assert_tokenizer_consistency(observer_name_or_path, performer_name_or_path)

        self.change_mode(mode)
        dtype = torch.bfloat16 if use_bfloat16 else torch.float32
        # NOTE(security): trust_remote_code=True allows code shipped with the
        # checkpoint to run locally; only point this at repositories you trust.
        self.observer_model = AutoModelForCausalLM.from_pretrained(observer_name_or_path,
                                                                   device_map={"": DEVICE_1},
                                                                   trust_remote_code=True,
                                                                   torch_dtype=dtype,
                                                                   token=huggingface_config["TOKEN"]
                                                                   )
        self.performer_model = AutoModelForCausalLM.from_pretrained(performer_name_or_path,
                                                                    device_map={"": DEVICE_2},
                                                                    trust_remote_code=True,
                                                                    torch_dtype=dtype,
                                                                    token=huggingface_config["TOKEN"]
                                                                    )
        self.observer_model.eval()
        self.performer_model.eval()

        # Pass the same token as the model loads so gated repos resolve here too.
        self.tokenizer = AutoTokenizer.from_pretrained(observer_name_or_path,
                                                       token=huggingface_config["TOKEN"])
        if not self.tokenizer.pad_token:
            # No dedicated pad token: reuse EOS so batched padding works.
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.max_token_observed = max_token_observed

    def change_mode(self, mode: str) -> None:
        """Select the decision threshold.

        Args:
            mode: "low-fpr" or "accuracy".

        Raises:
            ValueError: If `mode` is neither of the supported values.
        """
        if mode == "low-fpr":
            self.threshold = BINOCULARS_FPR_THRESHOLD
        elif mode == "accuracy":
            self.threshold = BINOCULARS_ACCURACY_THRESHOLD
        else:
            raise ValueError(f"Invalid mode: {mode}")

    def _tokenize(self, batch: list[str]) -> transformers.BatchEncoding:
        """Tokenize `batch` into tensors on the observer model's device."""
        batch_size = len(batch)
        encodings = self.tokenizer(
            batch,
            return_tensors="pt",
            # A single text needs no padding; larger batches pad to the longest.
            padding="longest" if batch_size > 1 else False,
            truncation=True,
            max_length=self.max_token_observed,
            return_token_type_ids=False).to(self.observer_model.device)
        return encodings

    @torch.inference_mode()
    def _get_logits(self, encodings: transformers.BatchEncoding) -> tuple[torch.Tensor, torch.Tensor]:
        """Run both models on `encodings`.

        Returns:
            `(observer_logits, performer_logits)`, each produced on the device
            its model was loaded on.
        """
        observer_logits = self.observer_model(**encodings.to(DEVICE_1)).logits
        performer_logits = self.performer_model(**encodings.to(DEVICE_2)).logits
        if DEVICE_1 != "cpu":
            torch.cuda.synchronize()
        return observer_logits, performer_logits

    def compute_score(self, input_text: Union[list[str], str]) -> Union[float, list[float]]:
        """Compute the Binoculars score for one text or a batch of texts.

        Args:
            input_text: A single string or a list of strings.

        Returns:
            A float for a string input, otherwise a list with one float per
            text. An empty list yields an empty list.
        """
        batch = [input_text] if isinstance(input_text, str) else input_text
        if len(batch) == 0:
            # Nothing to score; avoid calling the tokenizer with an empty batch.
            return []
        encodings = self._tokenize(batch)
        observer_logits, performer_logits = self._get_logits(encodings)
        # Place the encodings next to the performer logits explicitly instead of
        # relying on where _get_logits happened to leave them.
        ppl = perplexity(encodings.to(DEVICE_2), performer_logits)
        x_ppl = entropy(observer_logits.to(DEVICE_1), performer_logits.to(DEVICE_1),
                        encodings.to(DEVICE_1), self.tokenizer.pad_token_id)
        binoculars_scores = (ppl / x_ppl).tolist()
        return binoculars_scores[0] if isinstance(input_text, str) else binoculars_scores

    def predict(self, input_text: Union[list[str], str]) -> Union[list[str], str]:
        """Label text by comparing its score against the active threshold.

        Args:
            input_text: A single string or a list of strings.

        Returns:
            "Most likely AI-generated" when the score is strictly below the
            threshold, otherwise "Most likely human-generated"; a single label
            for a string input, a list of labels for a list input.
        """
        binoculars_scores = np.array(self.compute_score(input_text))
        pred = np.where(binoculars_scores < self.threshold,
                        "Most likely AI-generated",
                        "Most likely human-generated"
                        ).tolist()
        return pred