alenusch
/

par_cls_bert

## Classifier to check whether two sequences are paraphrases of each other
Trained on top of RuBERT by DeepPavlov.
Usage:
+```
+import torch
+import torch.nn as nn
+import os
+import copy
+import random
+import numpy as np
+import pandas as pd
+from torch.utils.data import DataLoader, Dataset
+from torch.cuda.amp import autocast, GradScaler
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
+from transformers.file_utils import (
+    cached_path,
+    hf_bucket_url,
+    is_remote_url,
+)
# Build the hub URL of the fine-tuned classifier checkpoint stored in the
# "alenusch/par_cls_bert" repository.
archive_file = hf_bucket_url(
    "alenusch/par_cls_bert",
    filename="rubert-base-cased_lr_2e-05_val_loss_0.66143_ep_4.pt",
    revision=None,
    mirror=None,
)

# Resolve that URL to a local file path; the result is later passed to
# torch.load() to restore the classifier weights.
resolved_archive_file = cached_path(
    archive_file,
    cache_dir=None,
    force_download=False,
    proxies=None,
    resume_download=False,
    local_files_only=False,
)

# Turn off parallelism inside the tokenizers library.
# NOTE(review): presumably set because the DataLoader below uses worker
# processes -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
class SentencePairClassifier(nn.Module):
    """Pretrained encoder with a single-logit head for sentence pairs.

    The encoder's pooled output goes through dropout (p=0.1) and a linear
    layer with one output unit. The result is a raw logit; apply a sigmoid
    to turn it into a probability.

    Args:
        bert_model: Model name or path passed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert_layer = AutoModel.from_pretrained(bert_model)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so encoders with a different hidden width also work.
        self.cls_layer = nn.Linear(self.bert_layer.config.hidden_size, 1)
        self.dropout = nn.Dropout(p=0.1)

    @autocast()
    def forward(self, input_ids, attn_masks, token_type_ids):
        """Compute raw logits for a batch of encoded sentence pairs.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attn_masks: Attention mask forwarded to the encoder.
            token_type_ids: Segment ids forwarded to the encoder.

        Returns:
            The output of the linear head (size 1 in the last dimension).
        """
        # Keyword arguments keep the call correct even if an encoder's
        # forward() orders its positional parameters differently.
        _, pooler_output = self.bert_layer(
            input_ids=input_ids,
            attention_mask=attn_masks,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.cls_layer(self.dropout(pooler_output))
class CustomDataset(Dataset):
    """Dataset that tokenizes sentence pairs when they are accessed.

    Args:
        data: Indexable collection where ``data[i][0]`` and ``data[i][1]``
            are the two sentences of pair ``i``.
        maxlen: Length every encoded pair is padded or truncated to.
        bert_model: Model name or path passed to
            ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, data, maxlen, bert_model):
        self.data = data
        self.maxlen = maxlen
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)
        # Always False here; nothing in this class reads it.
        self.targets = False

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, token_type_ids)`` for a pair."""
        pair = self.data[index]
        first, second = str(pair[0]), str(pair[1])
        encoding = self.tokenizer(
            first,
            second,
            padding='max_length',    # pad up to max_length
            truncation=True,         # cut down to max_length
            max_length=self.maxlen,
            return_tensors='pt',     # torch.Tensor outputs
        )
        # The tokenizer returns a leading axis of size 1 for the single
        # pair; drop it so each returned tensor describes one sample.
        fields = ('input_ids', 'attention_mask', 'token_type_ids')
        return tuple(encoding[name].squeeze(0) for name in fields)
def get_probs_from_logits(logits):
    """Convert raw logits to sigmoid probabilities as a NumPy array.

    A trailing axis of size 1 is appended before the sigmoid, so the result
    has one more dimension than ``logits``. The result is detached from the
    autograd graph and moved to the CPU.
    """
    expanded = logits.unsqueeze(-1)
    probabilities = torch.sigmoid(expanded)
    return probabilities.detach().cpu().numpy()
def test_prediction(net, device, dataloader, with_labels=False):
    """Run ``net`` over ``dataloader`` and collect sigmoid probabilities.

    Args:
        net: Model called as ``net(seq, attn_masks, token_type_ids)``.
        device: Device every batch tensor is moved to before the call.
        dataloader: Iterable yielding ``(seq, attn_masks, token_type_ids)``.
        with_labels: Accepted but not used by this function.

    Returns:
        A list with the probabilities of all batches, in iteration order.
    """
    net.eval()
    collected = []
    with torch.no_grad():
        for ids, masks, segments in tqdm(dataloader):
            ids = ids.to(device)
            masks = masks.to(device)
            segments = segments.to(device)
            # Drop the size-1 logit axis before the helper, then drop the
            # axis the helper adds back, leaving one value per sample.
            batch_logits = net(ids, masks, segments).squeeze(-1)
            batch_probs = get_probs_from_logits(batch_logits).squeeze(-1)
            collected.extend(batch_probs.tolist())
    return collected
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cls_model = SentencePairClassifier(bert_model="alenusch/par_cls_bert")
if torch.cuda.device_count() > 1:
    # Wrap the classifier that was just built; wrapping any other name
    # would raise NameError on multi-GPU machines.
    # NOTE(review): DataParallel prefixes parameter names with "module.",
    # so loading below assumes the checkpoint was saved from a wrapped
    # model in this case -- confirm against how the checkpoint was written.
    cls_model = nn.DataParallel(cls_model)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
cls_model.load_state_dict(torch.load(resolved_archive_file, map_location=device))
cls_model.to(device)
# Each entry is one [sentence1, sentence2] pair to score.
variants = [["sentence1", "sentence2"]]
test_set = CustomDataset(variants, maxlen=512, bert_model="alenusch/par_cls_bert")
test_loader = DataLoader(test_set, batch_size=16, num_workers=5)
# res holds one probability per pair in `variants`.
res = test_prediction(net=cls_model, device=device, dataloader=test_loader, with_labels=False)
+```