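"""Retrieval-based machine-text detection with LUAR embeddings.

Embeds a database of machine-generated passages and a test set with the
LUAR-CRUD authorship model, scores each test passage by its similarity to
the database, and logs the resulting metrics.
"""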
import argparse
import json
import os
from pathlib import Path
from typing import Sequence

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

from ..utils import evaluate_metrics

# Fix the RNG seed for reproducibility.
np.random.seed(43)

os.environ["TOKENIZERS_PARALLELISM"] = "true"

_LOG_PATH = Path(__file__).resolve().parents[3] / "runs" / "val-other_detector.txt"
_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)


class PassagesDataset(Dataset):
    """Wraps a list of JSONL records so a DataLoader can batch them."""

    def __init__(self, data):
        self.passages = data

    def __len__(self):
        return len(self.passages)

    def __getitem__(self, idx):
        item = self.passages[idx]
        # The stored label is inverted here: label 0 becomes 1, anything else becomes 0.
        label = int(item["label"]) == 0
        return item["text"], int(label), int(item["id"])


def load_jsonl(file_path, need_human=True):
    """Load a JSONL file; optionally drop human-written records."""
    out = []
    with open(file_path, mode="r", encoding="utf-8") as jsonl_file:
        for line in jsonl_file:
            item = json.loads(line)
            if not need_human and item["src"] == "human":
                continue
            out.append(item)
    print(f"Loaded {len(out)} examples from {file_path}")
    return out


def dict2str(metrics):
    """Render a metrics dict as a single space-separated line."""
    return " ".join(f"{key}:{value}" for key, value in metrics.items())


def gen_embeddings(data, model, tokenizer):
    """Encode every passage with the LUAR model and return (embeddings, labels)."""
    device = torch.device("cuda")
    dataset = PassagesDataset(data)
    dataloader = DataLoader(dataset, batch_size=32, num_workers=8, shuffle=False)
    labels, embeddings = [], []
    with torch.no_grad():
        for texts, label, ids in tqdm(dataloader, total=len(dataloader)):
            encoded_batch = tokenizer.batch_encode_plus(
                texts,
                return_tensors="pt",
                max_length=512,
                padding="max_length",
                truncation=True,
            )
            # LUAR expects input of shape (batch, episode_length, seq_length);
            # each passage is a one-document "episode", hence the unsqueeze on dim 1.
            for key in encoded_batch:
                encoded_batch[key] = encoded_batch[key].unsqueeze(1).to(device)
            batch_embeddings = model(**encoded_batch)
            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=-1)
            embeddings.append(batch_embeddings.cpu())
            labels.append(label.cpu())
    labels = torch.cat(labels, dim=0).numpy()
    embeddings = torch.cat(embeddings, dim=0).numpy()
    return embeddings, labels


def run(opt):
    device = torch.device("cuda")
    model = AutoModel.from_pretrained("rrivera1849/LUAR-CRUD", trust_remote_code=True)
    model.to(device)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained("rrivera1849/LUAR-CRUD")

    # The reference database keeps only machine-generated passages; the test set keeps both.
    database_data = load_jsonl(opt.database_path, need_human=False)
    test_data = load_jsonl(opt.test_dataset_path)
    print("Database Data Size:", len(database_data), "Test Data Size:", len(test_data))

    database_embeddings, database_labels = gen_embeddings(database_data, model, tokenizer)
    test_embeddings, test_labels = gen_embeddings(test_data, model, tokenizer)

    # Cosine similarity (the embeddings are L2-normalized); each test passage is
    # scored by its minimum similarity to any database passage.
    scores = test_embeddings @ database_embeddings.T
    scores = scores.min(axis=1)

    metric = evaluate_metrics(test_labels, scores)
    print(dict2str(metric))
    with _LOG_PATH.open("a+", encoding="utf-8") as f:
        f.write(f"UAR {opt.test_dataset_path}\n")
        f.write(f"{dict2str(metric)}\n")


def build_argument_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser()
    parser.add_argument("--database_path", type=str,
                        default="/path/to/RealBench/MAGE_Unseen/Unseen/5shot/train_0.jsonl")
    parser.add_argument("--test_dataset_path", type=str,
                        default="/path/to/RealBench/MAGE_Unseen/Unseen/5shot/test_0.jsonl")
    return parser


def main(argv: Sequence[str] | None = None) -> None:
    parser = build_argument_parser()
    opt = parser.parse_args(argv)
    run(opt)


if __name__ == "__main__":
    main()
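
# Run as a module so the relative import of ..utils resolves; the module path
# below is a placeholder for wherever this file lives in the package:
#   python -m <package>.detectors.uar_detector --database_path ... --test_dataset_path ...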

# Quick smoke test for the encoder, kept for reference:
# text = ["The quick brown fox jumps over the lazy dog.", "There is a cat on the roof."]
# encoded_batch = tokenizer.batch_encode_plus(
#     text,
#     return_tensors="pt",
#     max_length=512,
#     padding="max_length",
#     truncation=True,
# )
# for key in encoded_batch:
#     encoded_batch[key] = encoded_batch[key].unsqueeze(1).to(device)
# with torch.no_grad():
#     embeddings = model(**encoded_batch)
# print(embeddings.shape)  # one embedding per input text: (2, embedding_dim)