| import os |
| import sys |
| import json |
| import torch |
| import numpy as np |
| import evaluate |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed |
| from datasets import load_dataset |
| from tqdm import tqdm |
|
|
| |
| |
|
|
| |
# Read the run configuration from the command line:
#   argv[1] (optional) -> integer RNG seed, argv[2] (optional) -> language code.
cli_args = sys.argv[1:]
if cli_args:
    seed = int(cli_args[0])
    lang = cli_args[1] if len(cli_args) > 1 else "en"
else:
    seed = 42
    lang = "en"


# Seed python/numpy/torch RNGs via the transformers helper for reproducibility.
set_seed(seed)


# Accumulates every metric computed below; dumped as JSON at the end of the script.
result = {"seed": seed, "type": "no_finetune_baseline"}
|
|
| |
# GPT-2 ships without a pad token, so reuse EOS for padding during batching.
model_checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token


# Binary sequence-pair classifier on top of the pretrained GPT-2 backbone.
# The classification head is freshly initialized (this script never finetunes,
# hence the "no_finetune_baseline" result type).
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
# Mirror the tokenizer's EOS-as-pad choice in the model config so padded
# positions are handled consistently.
model.config.pad_token_id = model.config.eos_token_id


# Inference-only: move to GPU when available and disable dropout via eval().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
|
|
| |
def tokenize_short_function(example, max_length=256):
    """Tokenize a batch of (sentence1, sentence2) pairs to a fixed length.

    Intended for ``datasets.Dataset.map(..., batched=True)``; relies on the
    module-level ``tokenizer``. The previously hard-coded length is now a
    backward-compatible keyword parameter.

    Args:
        example: Mapping with "sentence1" and "sentence2" entries
            (str or list[str] when batched).
        max_length: Pad/truncate each pair to this many tokens (default 256).

    Returns:
        Tokenizer output dict (input_ids, attention_mask, ...).
    """
    return tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
|
|
def tokenize_full_function(example, max_length=512):
    """Tokenize a batch of (sentence1, sentence2) pairs to a fixed length.

    Intended for ``datasets.Dataset.map(..., batched=True)``; relies on the
    module-level ``tokenizer``. The previously hard-coded length is now a
    backward-compatible keyword parameter.

    Args:
        example: Mapping with "sentence1" and "sentence2" entries
            (str or list[str] when batched).
        max_length: Pad/truncate each pair to this many tokens (default 512).

    Returns:
        Tokenizer output dict (input_ids, attention_mask, ...).
    """
    return tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
|
|
| |
def run_inference(test_dataset, batch_size=64):
    """Score the module-level ``model`` on a tokenized evaluation split.

    Iterates the dataset in mini-batches, runs a gradient-free forward pass,
    takes the argmax class per example, and computes the GLUE/MRPC metric
    (accuracy and F1) over all predictions.

    Args:
        test_dataset: Tokenized ``datasets.Dataset`` exposing "input_ids",
            "attention_mask", and "label" columns.
        batch_size: Examples per forward pass (default 64).

    Returns:
        dict of metric values as produced by ``evaluate`` (accuracy, f1).
    """
    preds = []
    labels = []

    for start in tqdm(range(0, len(test_dataset), batch_size), desc="Predicting", disable=True):
        batch = test_dataset[start : start + batch_size]

        inputs = {
            "input_ids": torch.tensor(batch["input_ids"]).to(device),
            "attention_mask": torch.tensor(batch["attention_mask"]).to(device),
        }
        batch_labels = batch["label"]

        with torch.no_grad():
            outputs = model(**inputs)
            # Fix: use torch's native `dim` keyword; the original `axis=-1`
            # only works through torch's numpy-compatibility alias.
            batch_preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

        preds.extend(batch_preds)
        labels.extend(batch_labels)

    # GLUE/MRPC is also a sentence-pair binary task, so its metric bundle
    # (accuracy + F1) is reused for this dataset.
    metric = evaluate.load("glue", "mrpc")
    return metric.compute(predictions=preds, references=labels)
|
|
| |
| |
| |
# Short-sequence protein-pair config: take the published train split and carve
# out a seeded 70/30 train/test split (only the 30% test side is scored here).
raw_datasets_short = load_dataset('dnagpt/biopaws', 'protein_pair_short')['train'].train_test_split(test_size=0.3, seed=seed)


# Tokenize with 4 worker processes (256-token sequences), then score the
# held-out split with the un-finetuned model.
tokenized_raw_datasets_short = raw_datasets_short.map(tokenize_short_function, batched=True, num_proc=4)
ret_1 = run_inference(tokenized_raw_datasets_short["test"])
result["protein_pair_short"] = ret_1
|
|
|
|
| |
| |
| |
# Full-length protein-pair config: same seeded 70/30 split procedure as the
# short variant, but tokenized to 512 tokens.
raw_datasets_full = load_dataset('dnagpt/biopaws', 'protein_pair_full')['train'].train_test_split(test_size=0.3, seed=seed)


# Tokenize with 4 worker processes, then score the held-out split.
tokenized_raw_datasets_full = raw_datasets_full.map(tokenize_full_function, batched=True, num_proc=4)
ret_2 = run_inference(tokenized_raw_datasets_full["test"])
result["protein_pair_full"] = ret_2


# Emit all collected metrics as a single JSON object on stdout
# (presumably parsed by a downstream aggregation step — TODO confirm).
print(json.dumps(result))