## How to Get Started with the Model
import json
from collections import OrderedDict, defaultdict

import faiss
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer, default_data_collator
class NamesDataset(Dataset):
    """Torch dataset over a tokenizer output (a BatchEncoding).

    Each item is a dict mapping every encoding field (input_ids,
    attention_mask, ...) to a tensor for that single example.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        return item

    def __len__(self):
        # BatchEncoding exposes its fields as attributes as well as keys.
        return len(self.encodings.input_ids)
def embed_dense(tokenizer, encoder, names, max_length, show_progress=True, use_cuda=True, index=None):
    """
    Embed names into dense representations ([CLS] vector of the encoder).

    Parameters
    ----------
    tokenizer : transformers tokenizer
        Converts names into fixed-length token-id tensors.
    encoder : transformers model
        Produces hidden states; the position-0 ([CLS]) vector is used.
    names : np.ndarray or list of str
        The surface forms to embed (a single string also works, since the
        tokenizer accepts it).
    max_length : int
        Token length every name is padded/truncated to.
    show_progress : bool, optional
        Display a tqdm progress bar (when tqdm is installed).
    use_cuda : bool, optional
        Move the tokenized inputs to the GPU before encoding.
    index : faiss index, optional
        If given, embeddings are added to it and the index is returned.
        If None, the stacked embeddings are returned instead (the original
        code crashed on `index.add` in that case).

    Returns
    -------
    faiss index or np.ndarray
        The populated index when `index` is not None, otherwise a
        (num_names, hidden_dim) array of dense embeddings.
    """
    # Optional progress bar: the original referenced `tqdm` without
    # importing it (NameError); fall back to a no-op wrapper if absent.
    try:
        from tqdm import tqdm as _tqdm
    except ImportError:
        def _tqdm(iterable, **kwargs):
            return iterable

    encoder.eval()  # prevent dropout for deterministic embeddings
    batch_size = 1024
    dense_embeds = []
    if isinstance(names, np.ndarray):
        names = names.tolist()
    name_encodings = tokenizer(names, padding="max_length", max_length=max_length,
                               truncation=True, return_tensors="pt")
    if use_cuda:
        name_encodings = name_encodings.to('cuda')
    name_dataset = NamesDataset(name_encodings)
    name_dataloader = DataLoader(name_dataset, shuffle=False,
                                 collate_fn=default_data_collator, batch_size=batch_size)
    with torch.no_grad():
        for batch in _tqdm(name_dataloader, disable=not show_progress, desc='embedding dictionary'):
            outputs = encoder(**batch)
            # [CLS] embedding: first token of the last hidden state
            batch_dense_embeds = outputs[0][:, 0].cpu().detach().numpy()
            if index is not None:
                index.add(batch_dense_embeds)
            else:
                dense_embeds.append(batch_dense_embeds)
    if index is not None:
        print(f"Total embeddings in index: {index.ntotal}")
        return index
    # No index supplied: return the embeddings themselves, 2-D and ready
    # to be passed to a faiss `search` call.
    return np.concatenate(dense_embeds, axis=0)
def check_label(predicted_cui, golden_cui):
    """
    Decide whether a predicted CUI matches the gold annotation.

    Composite annotations ('|'-joined CUIs) are unordered, so the
    prediction counts as correct when it shares at least one CUI with
    the gold composite (or single) CUI.

    Returns 1 on a match, 0 otherwise.
    """
    predicted = set(predicted_cui.split("|"))
    golden = set(golden_cui.split("|"))
    return 1 if predicted & golden else 0
# Load the retriever encoder and its tokenizer from a local checkpoint.
model_path = "./ncbi-disease-sapbert-retriever"
encoder = AutoModel.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# `use_cuda` was referenced but never defined in the original snippet;
# derive it from the runtime so the script also works on CPU-only machines.
use_cuda = torch.cuda.is_available()
if use_cuda:
    encoder = encoder.to("cuda")
embed_dim = 768  # hidden size of the encoder's output vectors
# Load the terminology: each line of the KB file is "CUI||name".
dictionary_path = "path/to/the/kb/with/terms/and/ids"
with open(dictionary_path) as file:
    dictionary = file.readlines()
cui2names = defaultdict(list)
name2cui = dict()
for line in dictionary:
    cui, name = line.split("||")
    name = name.strip()
    # The original declared cui2names but never filled it; populate the
    # reverse mapping too so both lookup directions are usable.
    cui2names[cui].append(name)
    name2cui[name] = cui
# (name, cui) pairs as a 2-D string array, indexable by faiss result ids.
eval_dictionary = np.array(list(name2cui.items()))
# Create a flat inner-product faiss index and populate it with dense
# embeddings of every dictionary term.
index = embed_dense(
    tokenizer,
    encoder,
    list(name2cui.keys()),
    max_length=50,
    use_cuda=use_cuda,
    index=faiss.index_factory(embed_dim, 'Flat', faiss.METRIC_INNER_PRODUCT),
)
# Retrieve candidate concepts for each query (mention).
# NOTE(review): `queries_list` is never defined in this snippet — it is
# presumably built upstream; each entry appears to be a dict with
# "entity_text" and "cui" keys. Confirm with the caller.
try:
    from tqdm import tqdm
except ImportError:  # fall back to a plain loop when tqdm is absent
    def tqdm(iterable, **kwargs):
        return iterable

k_candidates = 20
candidate_rankings = []
for query in tqdm(queries_list):
    # Embed the mention inline (the original called embed_dense without an
    # index, which crashed, and indexed `[0]` into the result, which would
    # have produced a 1-D vector). faiss `search` needs a 2-D (1, dim) array.
    with torch.no_grad():
        mention_enc = tokenizer([query["entity_text"]], padding="max_length",
                                max_length=50, truncation=True, return_tensors="pt")
        if use_cuda:
            mention_enc = mention_enc.to("cuda")
        mention_embedding = encoder(**mention_enc)[0][:, 0].cpu().numpy()
    # Search index
    dists, candidate_idxs = index.search(mention_embedding, k=k_candidates)
    ranked_lists = []
    for query_candidate_idx in candidate_idxs:
        candidates = eval_dictionary[query_candidate_idx].squeeze()  # ranked candidate list
        # Deduplicate CUIs while preserving ranking order
        golden_cui = query["cui"]
        unique_candidates = OrderedDict()
        for name, cui in candidates:
            if cui not in unique_candidates:
                unique_candidates[cui] = {
                    "name": name,
                    "cui": cui,
                    "label": check_label(cui, golden_cui),
                }
        ranked_lists.append(unique_candidates)
    candidate_rankings.append(ranked_lists)
Downloads last month: 7
Inference Providers
NEW
This model isn't deployed by any Inference Provider.
🙋
Ask for provider support