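# Hugging Face file-view header (web-page residue, not part of the program):
#   uploader: MAS-AI-0000 -- commit message "Upload 6 files"
#   commit: 3cdaafb (verified)
#   page links: raw / history / blame
#   file size: 4.71 kB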
import argparse
import json
import os
from pathlib import Path
from typing import Sequence
import numpy as np; np.random.seed(43)
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from ..utils import evaluate_metrics
# Set the TOKENIZERS_PARALLELISM environment variable for this process.
# NOTE(review): presumably read by the Hugging Face `tokenizers` backend to
# opt in to parallel tokenization -- confirm against the tokenizers docs,
# especially in combination with DataLoader worker processes.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Metrics are appended to "runs/val-other_detector.txt" under the directory at
# index 3 of this file's resolved parents (three levels above the directory
# that contains this file).
_LOG_PATH = Path(__file__).resolve().parents[3] / "runs" / "val-other_detector.txt"
# Import-time side effect: make sure the "runs" directory exists.
_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
class PassagesDataset(Dataset):
    """Map-style dataset over a sequence of passage records.

    Each record must be indexable by the keys ``'text'``, ``'label'`` and
    ``'id'``. Items are returned as ``(text, label, id)`` where ``label`` is
    ``1`` when the record's label equals ``0`` after ``int()`` conversion and
    ``0`` otherwise, and ``id`` is converted with ``int()``.
    """

    def __init__(self, data):
        # Kept by reference; the records are not copied.
        self.passages = data

    def __len__(self):
        return len(self.passages)

    def __getitem__(self, idx):
        record = self.passages[idx]
        passage_text = record['text']
        # The stored label is inverted: raw 0 -> 1, anything else -> 0.
        # NOTE(review): presumably this makes the raw-0 class the positive
        # class for the metrics -- confirm against the dataset schema.
        raw_is_zero = int(record['label']) == 0
        return passage_text, (1 if raw_is_zero else 0), int(record['id'])
def load_jsonl(file_path, need_human=True):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of a UTF-8 encoded file with one JSON value per line.
        need_human: Boolean. When false, records whose ``'src'`` field equals
            ``'human'`` are dropped. When true, every record is kept and the
            ``'src'`` field is not inspected at all.

    Returns:
        The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If ``need_human`` is false and a record has no ``'src'``.
    """
    out = []
    with open(file_path, mode='r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads would raise on them.
            if not line.strip():
                continue
            item = json.loads(line)
            # Only look at 'src' when filtering is actually requested, so
            # files without that field still load with need_human=True.
            if not need_human and item['src'] == 'human':
                continue
            out.append(item)
    print(f"Loaded {len(out)} examples from {file_path}")
    return out
def dict2str(metrics):
    """Render a metrics mapping as ``"key:value "`` pairs on one line.

    Pairs appear in the mapping's key order and each one, including the last,
    is followed by a single space. An empty mapping yields an empty string.
    """
    return ''.join(f"{name}:{metrics[name]} " for name in metrics.keys())
def gen_embeddings(data, model, tokenizer, *, batch_size=32, num_workers=8, max_length=512):
    """Embed every passage in ``data`` and return unit-norm vectors plus labels.

    Args:
        data: Sequence of records accepted by ``PassagesDataset``.
        model: Callable that takes the tokenizer's output tensors as keyword
            arguments and returns one embedding tensor per batch.
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        batch_size: Number of passages per forward pass.
        num_workers: Number of ``DataLoader`` worker processes.
        max_length: Every passage is truncated/padded to this many tokens.

    Returns:
        ``(embeddings, labels)`` as NumPy arrays, concatenated along axis 0 in
        dataset order (the loader does not shuffle). Embeddings are
        L2-normalised along their last axis.
    """
    # Send inputs to wherever the model's weights live instead of assuming
    # CUDA, so a model placed on CPU or another accelerator does not hit a
    # device mismatch. Falls back to the previous hard-coded "cuda" when the
    # model exposes no parameters.
    param_iter = model.parameters() if hasattr(model, "parameters") else iter(())
    first_param = next(iter(param_iter), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    dataset = PassagesDataset(data)
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
    labels, embeddings = [], []
    with torch.no_grad():
        for texts, label, _ids in tqdm(dataloader, total=len(dataloader)):
            encoded_batch = tokenizer.batch_encode_plus(
                texts,
                return_tensors="pt",
                max_length=max_length,
                padding='max_length',
                truncation=True,
            )
            # Insert a singleton axis at position 1 on every input tensor.
            # NOTE(review): presumably the model expects one extra
            # per-sample grouping axis -- confirm against the model card.
            model_inputs = {
                key: tensor.unsqueeze(1).to(device)
                for key, tensor in encoded_batch.items()
            }
            now_embeddings = model(**model_inputs)
            now_embeddings = F.normalize(now_embeddings, p=2, dim=-1)
            # Accumulate on CPU so accelerator memory holds one batch only.
            embeddings.append(now_embeddings.cpu())
            labels.append(label.cpu())
    labels = torch.cat(labels, dim=0).numpy()
    embeddings = torch.cat(embeddings, dim=0).numpy()
    return embeddings, labels
def run(opt):
    """Score a test set against a reference database and log the metrics.

    Loads the "rrivera1849/LUAR-CRUD" model and tokenizer, embeds the database
    file (with ``src == 'human'`` records filtered out) and the test file,
    scores each test passage by its minimum dot product with the database
    embeddings, passes labels and scores to ``evaluate_metrics``, then prints
    the result and appends it to the module's log file.

    Args:
        opt: Namespace providing ``database_path`` and ``test_dataset_path``.
    """
    device = torch.device("cuda")
    model = AutoModel.from_pretrained("rrivera1849/LUAR-CRUD", trust_remote_code=True)
    model.to(device)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained("rrivera1849/LUAR-CRUD")

    reference_records = load_jsonl(opt.database_path, need_human=False)
    test_records = load_jsonl(opt.test_dataset_path)
    print("Database Data Size:", len(reference_records), "Test Data Size:", len(test_records))

    # Labels of the reference set are not used for scoring.
    reference_vectors, _reference_labels = gen_embeddings(reference_records, model, tokenizer)
    test_vectors, test_labels = gen_embeddings(test_records, model, tokenizer)

    # Rows are test passages, columns are database passages.
    # NOTE(review): the embeddings are L2-normalised, so these dot products
    # are cosine similarities and `min` selects the LEAST similar database
    # entry. If a nearest-neighbour score was intended this would need `max`;
    # confirm against the score direction expected by evaluate_metrics.
    pairwise = test_vectors @ reference_vectors.T
    scores = pairwise.min(axis=1)

    metric = evaluate_metrics(test_labels, scores)
    summary = dict2str(metric)
    print(summary)
    with _LOG_PATH.open("a+", encoding="utf-8") as log_file:
        log_file.write(f"UAR {opt.test_dataset_path}\n")
        log_file.write(f"{summary}\n")
def build_argument_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script.

    Registers two string options, ``--database_path`` and
    ``--test_dataset_path``, each with a placeholder default path.
    """
    default_paths = {
        "--database_path": "/path/to/RealBench/MAGE_Unseen/Unseen/5shot/train_0.jsonl",
        "--test_dataset_path": "/path/to/RealBench/MAGE_Unseen/Unseen/5shot/test_0.jsonl",
    }
    parser = argparse.ArgumentParser()
    for flag, default_value in default_paths.items():
        parser.add_argument(flag, type=str, default=default_value)
    return parser
def main(argv: Sequence[str] | None = None) -> None:
    """Parse command-line options and run the evaluation.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` makes argparse
            read ``sys.argv``.
    """
    options = build_argument_parser().parse_args(argv)
    run(options)
# Run the evaluation only when this file is executed as a script, not when it
# is imported. NOTE: the module uses a relative import (`from ..utils import
# ...`), so it has to be launched as part of its package (e.g. `python -m`).
if __name__ == "__main__":
    main()
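# --- Scratch example (not executed): embed two sentences and print the shape
# --- of the model output. Assumes `tokenizer`, `model` and `device` have
# --- already been created the same way run() creates them.
# text = ['The quick brown fox jumps over the lazy dog.','There is a cat on the roof.']
# encoded_batch = tokenizer.batch_encode_plus(
#     text,
#     return_tensors="pt",
#     max_length=512,
#     padding='max_length',
#     truncation=True,
# )
# for key in encoded_batch:
#     encoded_batch[key] = encoded_batch[key].unsqueeze(1).to(device)
# with torch.no_grad():
#     embeddings = model(**encoded_batch)
# print(embeddings.shape)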