| import os |
| import torch |
| from typing import Any, Dict, List |
| from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM |
|
|
| |
# Sets HF_HUB_TRUST_REMOTE_CODE in this process's environment when the module
# is imported.
# NOTE(review): this assignment runs after `transformers` has already been
# imported above, and nothing in this file reads the variable; confirm that a
# library actually consumes it. The from_pretrained calls in EndpointHandler
# pass trust_remote_code=True explicitly regardless.
| os.environ["HF_HUB_TRUST_REMOTE_CODE"] = "True" |
|
|
class EndpointHandler:
    """Callable handler that turns one nucleotide sequence into one embedding.

    The sequence string is cut into overlapping character windows, each
    window is run through a masked-LM checkpoint, the last hidden state of
    every window is averaged over dim 1 (presumably the token axis), and the
    per-window vectors are averaged into a single vector.
    """

    # Hub repository the config, tokenizer and weights are loaded from.
    MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-50m-multi-species"
    # Window length in characters of the input string. The same number is
    # also handed to the tokenizer as ``max_length`` (a limit in tokens).
    CHUNK_SIZE = 1000
    # Distance between consecutive window starts; smaller than CHUNK_SIZE,
    # so neighbouring windows overlap.
    STRIDE = 500

    def __init__(self, path=""):
        """Load config, tokenizer and model, and put the model in eval mode.

        Args:
            path: Accepted for interface compatibility but not used; the
                checkpoint is always loaded by ``MODEL_ID``.
        """
        self.model_id = self.MODEL_ID

        # trust_remote_code=True lets transformers execute Python code
        # shipped in the model repository.
        self.config = AutoConfig.from_pretrained(
            self.model_id, trust_remote_code=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_id, trust_remote_code=True
        )
        self.model = AutoModelForMaskedLM.from_pretrained(
            self.model_id,
            config=self.config,
            trust_remote_code=True,
        )

        # Resolve the device once instead of re-querying CUDA on every call.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.model = self.model.to(self.device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[float]:
        """Embed the sequence found in ``data["inputs"]``.

        Args:
            data: Request payload. ``"inputs"`` is removed from it and must
                be a non-empty string, or a non-empty list whose first
                element is such a string (further elements are ignored).

        Returns:
            The averaged embedding as a flat list of floats.

        Raises:
            ValueError: If no usable sequence string is supplied.
        """
        inputs = data.pop("inputs", data)
        if isinstance(inputs, list):
            if not inputs:
                raise ValueError("'inputs' must not be an empty list")
            # Only the first sequence of a batch is embedded.
            inputs = inputs[0]
        if not isinstance(inputs, str):
            raise ValueError(
                "'inputs' must be a sequence string, got "
                f"{type(inputs).__name__}"
            )
        if not inputs:
            # Without this guard no window is produced and torch.stack
            # fails on an empty list with an unrelated-looking error.
            raise ValueError("'inputs' must be a non-empty sequence string")

        chunk_size = self.CHUNK_SIZE
        # NOTE(review): because the range runs up to len(inputs), the final
        # window of any input longer than STRIDE lies entirely inside the
        # previous window, so the tail is counted twice in the average.
        # Kept as-is so that returned embeddings do not change.
        chunks = [
            inputs[start:start + chunk_size]
            for start in range(0, len(inputs), self.STRIDE)
        ]

        chunk_embeddings = []
        with torch.no_grad():
            for chunk in chunks:
                tokens = self.tokenizer(
                    chunk,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=chunk_size,
                )
                tokens = {
                    key: value.to(self.device)
                    for key, value in tokens.items()
                }
                outputs = self.model(**tokens, output_hidden_states=True)
                # Average the last hidden state over dim 1, then drop only
                # the size-1 batch axis.
                pooled = outputs.hidden_states[-1].mean(dim=1).squeeze(0)
                chunk_embeddings.append(pooled)

        # Every window contributes equally, whatever its length.
        return torch.stack(chunk_embeddings).mean(dim=0).cpu().tolist()