## How to Get Started with the Model

The snippet below loads the retriever and its tokenizer, tokenizes a small list of disease names, and encodes each name into a dense embedding taken from the [CLS] token representation.
```python
from transformers import AutoModel, AutoTokenizer, default_data_collator
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from tqdm import tqdm

# Load the encoder and its tokenizer from the Hugging Face Hub.
model_name_or_path = "Dash00/retriever-SapBERT-mondo"
encoder = AutoModel.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

use_cuda = torch.cuda.is_available()
if use_cuda:
    encoder = encoder.cuda()

max_length = 128
batch_size = 16
show_progress = True

# Names to embed, e.g. a dictionary of candidate concept names.
names = ["covid-19", "Coronavirus infection", "high fever", "Tumor of posterior wall of oropharynx"]
name_encodings = tokenizer(names, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
if use_cuda:
    name_encodings = {k: v.cuda() for k, v in name_encodings.items()}

# Thin Dataset wrapper so the tokenized names can be batched by a DataLoader.
class NamesDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return self.encodings["input_ids"].shape[0]

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.encodings.items()}

name_dataset = NamesDataset(name_encodings)
name_dataloader = DataLoader(name_dataset, shuffle=False, collate_fn=default_data_collator, batch_size=batch_size)

# Encode in batches; the [CLS] token embedding serves as the dense representation.
dense_embeds = []
encoder.eval()
with torch.no_grad():
    for batch in tqdm(name_dataloader, disable=not show_progress, desc="embedding dictionary"):
        outputs = encoder(**batch)
        batch_dense_embeds = outputs.last_hidden_state[:, 0].cpu().numpy()
        dense_embeds.append(batch_dense_embeds)
dense_embeds = np.concatenate(dense_embeds, axis=0)  # shape: (len(names), hidden_size)
```
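
With the names embedded, retrieval reduces to nearest-neighbor search over `dense_embeds`. The sketch below is a minimal illustration, assuming the `encoder`, `tokenizer`, `names`, and `dense_embeds` from the snippet above; the query string and the inner-product scoring are illustrative assumptions, not part of the original example.

```python
# Embed a query mention the same way (CLS token) and rank the dictionary names
# by inner-product similarity. "heat fever" is a hypothetical query mention.
query_encodings = tokenizer(["heat fever"], padding="max_length", max_length=max_length,
                            truncation=True, return_tensors="pt")
if use_cuda:
    query_encodings = {k: v.cuda() for k, v in query_encodings.items()}

with torch.no_grad():
    query_embed = encoder(**query_encodings).last_hidden_state[:, 0].cpu().numpy()

scores = query_embed @ dense_embeds.T  # shape: (1, len(names))
for i in np.argsort(-scores[0]):       # most similar name first
    print(names[i], float(scores[0][i]))
```

For a full dictionary (e.g. all MONDO concept names), the same embeddings can be fed to an approximate nearest-neighbor index instead of the dense matrix product used here.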