How to Get Started with the Model
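The snippet below walks through the full retrieval pipeline: load the retriever, embed a terminology of (CUI, name) pairs into a FAISS index, then search that index with entity mentions and collect ranked, deduplicated candidate concepts.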


from transformers import AutoModel, AutoTokenizer, default_data_collator
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict, OrderedDict
from tqdm import tqdm
import torch, faiss
import numpy as np


class NamesDataset(Dataset):
    """Wraps tokenizer encodings so they can be batched by a DataLoader."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # Encodings are already tensors (return_tensors="pt"), so just slice them
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)


def embed_dense(tokenizer, encoder, names, max_length, show_progress=True, use_cuda=True, index=None):
    """
    Embed names into dense representations using the encoder's [CLS] vector.

    Parameters
    ----------
    names : np.ndarray or list of str
        The names to embed.
    index : faiss.Index or None
        If given, embeddings are added to this index and the index is
        returned; otherwise a (len(names), dim) numpy array is returned.

    Returns
    -------
    faiss.Index or np.ndarray
    """
    encoder.eval()  # disable dropout
    batch_size = 1024
    dense_embeds = []

    if isinstance(names, np.ndarray):
        names = names.tolist()

    name_encodings = tokenizer(names, padding="max_length", max_length=max_length,
                               truncation=True, return_tensors="pt")
    if use_cuda:
        name_encodings = name_encodings.to("cuda")

    name_dataset = NamesDataset(name_encodings)
    name_dataloader = DataLoader(name_dataset, shuffle=False,
                                 collate_fn=default_data_collator, batch_size=batch_size)

    with torch.no_grad():
        for batch in tqdm(name_dataloader, disable=not show_progress, desc="embedding dictionary"):
            outputs = encoder(**batch)
            # [CLS] token embedding of the last hidden state
            batch_dense_embeds = outputs[0][:, 0].cpu().detach().numpy()
            if index is not None:
                index.add(batch_dense_embeds)
            else:
                dense_embeds.append(batch_dense_embeds)

    if index is not None:
        print(f"Total embeddings in index: {index.ntotal}")
        return index
    return np.concatenate(dense_embeds, axis=0)


def check_label(predicted_cui, golden_cui):
    """
    Some composite annotations do not fix the order of their CUIs,
    so return 1 if any predicted CUI matches any gold CUI
    (composite CUIs are '|'-separated), and 0 otherwise.
    """
    return int(len(set(predicted_cui.split("|")) & set(golden_cui.split("|"))) > 0)
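
For example, with '|'-separated composite CUIs (hypothetical IDs):

# check_label("C0001|C0002", "C0002")  -> 1 (one CUI overlaps)
# check_label("C0001", "C0002")        -> 0 (no overlap)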


model_path = "./ncbi-disease-sapbert-retriever"
encoder = AutoModel.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path)
embed_dim = encoder.config.hidden_size  # 768 for this BERT-base retriever
use_cuda = torch.cuda.is_available()
if use_cuda:
    encoder = encoder.to("cuda")

# Load the terminology: one "CUI||name" pair per line
dictionary_path = "path/to/the/kb/with/terms/and/ids"
with open(dictionary_path) as file:
    dictionary = file.readlines()

cui2names = defaultdict(list)
name2cui = dict()
for line in dictionary:
    cui, name = line.strip().split("||")
    cui2names[cui].append(name)
    name2cui[name] = cui

# (name, cui) pairs, index-aligned with the order names are added to FAISS
eval_dictionary = np.array(list(name2cui.items()))
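
The loader above expects one CUI||name pair per line; a hypothetical two-line dictionary illustrating the format:

D001943||breast cancer
D001943||malignant neoplasm of breast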

# Create a flat inner-product FAISS index and embed the terminology into it
index = faiss.index_factory(embed_dim, "Flat", faiss.METRIC_INNER_PRODUCT)
index = embed_dense(tokenizer, encoder, list(name2cui.keys()), max_length=50, use_cuda=use_cuda, index=index)
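
Since embedding a large terminology is the slow step, the populated index can be written to disk with FAISS's standard I/O helpers; a minimal sketch (the filename is an assumption):

faiss.write_index(index, "ncbi_disease_dictionary.index")  # persist (hypothetical filename)
# ... later: index = faiss.read_index("ncbi_disease_dictionary.index")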

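The loop below assumes queries_list is already defined; a minimal hypothetical example of the expected structure (both the mention texts and CUIs are placeholders):

queries_list = [
    {"entity_text": "breast cancer", "cui": "D001943"},
    {"entity_text": "type 2 diabetes", "cui": "D003924"},
]
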
# Retrieve candidate concepts for each query (mention);
# queries_list holds dicts with "entity_text" and "cui" keys (see the sketch above)
k_candidates = 20
candidate_rankings = []
for query in tqdm(queries_list):
    # Embed the single mention; shape (1, embed_dim)
    mention_embedding = embed_dense(tokenizer, encoder, [query["entity_text"]],
                                    max_length=50, show_progress=False, use_cuda=use_cuda)
    # Search the index for the nearest dictionary names
    dists, candidate_idxs = index.search(mention_embedding, k_candidates)

    golden_cui = query["cui"]
    ranked_lists = []
    for query_candidate_idx in candidate_idxs:
        candidates = eval_dictionary[query_candidate_idx]  # ranked (name, cui) candidate list

        # Deduplicate CUIs while preserving ranking order
        unique_candidates = OrderedDict()
        for name, cui in candidates:
            if cui not in unique_candidates:
                unique_candidates[cui] = {
                    "name": name,
                    "cui": cui,
                    "label": check_label(cui, golden_cui),
                }
        ranked_lists.append(list(unique_candidates.values()))

    candidate_rankings.append(ranked_lists)
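
Because every deduplicated candidate carries a "label" field, top-k accuracy can be computed directly from candidate_rankings; a minimal sketch assuming the list-of-dicts structure built above:

def accuracy_at_k(candidate_rankings, k):
    # Fraction of mentions whose gold CUI appears among the top-k unique candidates
    hits, total = 0, 0
    for ranked_lists in candidate_rankings:
        for candidates in ranked_lists:
            total += 1
            hits += int(any(c["label"] for c in candidates[:k]))
    return hits / max(total, 1)

print(f"acc@1: {accuracy_at_k(candidate_rankings, 1):.3f}")
print(f"acc@{k_candidates}: {accuracy_at_k(candidate_rankings, k_candidates):.3f}")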