File size: 6,233 Bytes
3a19a3f 21fa57e e2537a6 21fa57e 3a19a3f e2537a6 3a19a3f b24b48a 3a19a3f e2537a6 21fa57e e2537a6 b24b48a 21fa57e e2537a6 b24b48a 21fa57e b24b48a 21fa57e b24b48a 3a19a3f b24b48a 3a19a3f b24b48a 3a19a3f b24b48a 3a19a3f b24b48a 3a19a3f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import torch
import torch.nn as nn
from transformers import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding
from transformers import AutoTokenizer, AutoModel
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit import RDLogger
from rdkit.Chem import Draw
import joblib
import numpy as np
import os
from huggingface_hub import snapshot_download
import warnings
from sklearn.exceptions import InconsistentVersionWarning
from torchvision import models, transforms
from PIL import Image
# Ignore scikit-learn's InconsistentVersionWarning for the whole process.
# NOTE(review): presumably raised when the joblib-pickled feature transformers
# were fitted under a different scikit-learn version -- confirm.
warnings.filterwarnings("ignore", category=InconsistentVersionWarning)
# Turn off RDKit log output for every logger matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')
class BBBTokenizer(PreTrainedTokenizer):
    """Multimodal SMILES featurizer exposed through the tokenizer interface.

    Instead of token ids, every SMILES string is turned into three feature
    vectors:

    * ``tab`` -- RDKit 2D descriptors concatenated with MACCS keys,
    * ``img`` -- features from a ResNet-50 (last child module removed) applied
      to a rendered depiction of the molecule,
    * ``txt`` -- the ChemBERTa last hidden state averaged over tokens.

    Each vector is passed through a task-specific fitted transformer that is
    downloaded from the ``SaeedLab/TITAN-BBB`` Hugging Face repository and
    loaded with joblib the first time a task is requested.
    """

    # Hub repository holding the fitted feature transformers.
    _TRANSFORMER_REPO = "SaeedLab/TITAN-BBB"
    # Public task name -> infix used in the joblib file names.
    _TASK_PREFIXES = {"classification": "cls", "regression": "reg"}

    def __init__(self, **kwargs):
        """Build the descriptor calculator and the frozen feature extractors.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        super().__init__(**kwargs)
        self.calc = MoleculeDescriptors.MolecularDescriptorCalculator(
            [i[0] for i in Descriptors.descList]
        )
        self.tokenizer = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-100M-MLM')
        self.chemberta = AutoModel.from_pretrained('DeepChem/ChemBERTa-100M-MLM').eval()
        self.resnet50_backbone = models.resnet50(weights="IMAGENET1K_V1")
        # Drop the last child module of the ResNet so the network emits
        # features instead of class logits.
        self.resnet = nn.Sequential(*list(self.resnet50_backbone.children())[:-1]).eval()
        self.img_preprocess = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
        ])
        # Fitted transformers are loaded lazily, once the task is known.
        self.feature_transformer_tab = None
        self.feature_transformer_img = None
        self.feature_transformer_txt = None
        self.task = None

    def generate_tab_features(self, smiles):
        """Return the transformed tabular feature vector for one SMILES string.

        Args:
            smiles: SMILES string of the molecule.

        Returns:
            A 1-D ``torch.float32`` tensor. For a SMILES that RDKit cannot
            parse, a zero vector of length
            ``feature_transformer_tab.n_features_in_`` is returned.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Must be a vector (not a scalar holding the feature count) so the
            # result can be stacked with the vectors of valid molecules.
            # NOTE(review): assumes the transformer keeps the feature count
            # unchanged (output width == n_features_in_) -- confirm.
            return torch.zeros(
                int(self.feature_transformer_tab.n_features_in_),
                dtype=torch.float32,
            )
        rdkit_2d = np.array(self.calc.CalcDescriptors(mol), dtype=np.float64)
        # Replaces NaN as well as +/-inf with 0.0 in one pass.
        rdkit_2d = np.nan_to_num(rdkit_2d, nan=0.0, posinf=0.0, neginf=0.0)
        maccs = np.array(list(MACCSkeys.GenMACCSKeys(mol).ToBitString()), dtype=int)
        tab_input = np.concatenate([rdkit_2d, maccs])
        tab_input = self.feature_transformer_tab.transform(tab_input.reshape(1, -1))[0]
        # Bound extreme values produced by the transformer.
        tab_input = np.clip(tab_input, -1e5, 1e5)
        return torch.tensor(tab_input, dtype=torch.float32)

    def generate_img_features(self, smiles):
        """Return the transformed image feature vector for one SMILES string.

        The molecule is drawn at 300x300 pixels; a SMILES that RDKit cannot
        parse is represented by an all-black image of the same size.

        Args:
            smiles: SMILES string of the molecule.

        Returns:
            A 1-D ``torch.float32`` tensor.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            img = Image.new("RGB", (300, 300), color=(0, 0, 0))
        else:
            img = Draw.MolToImage(mol, size=(300, 300))
        img = self.img_preprocess(img)
        with torch.no_grad():
            pooled = self.resnet(img.unsqueeze(0)).squeeze(-1).squeeze(-1)
        # Hand the fitted transformer a NumPy array, as the text path does,
        # instead of relying on implicit tensor-to-array conversion.
        img_input = self.feature_transformer_img.transform(
            pooled.reshape(1, -1).cpu().numpy()
        )[0]
        return torch.tensor(img_input, dtype=torch.float32)

    def generate_txt_features(self, smiles):
        """Return the transformed text feature vector for one SMILES string.

        Args:
            smiles: SMILES string of the molecule.

        Returns:
            A 1-D ``torch.float32`` tensor derived from the ChemBERTa last
            hidden state averaged over the token axis.
        """
        encoded = self.tokenizer(smiles, return_tensors="pt")
        with torch.no_grad():
            outputs = self.chemberta(**encoded)
        hidden_states = outputs.last_hidden_state[0].mean(axis=0).numpy()
        txt_input = self.feature_transformer_txt.transform(hidden_states.reshape(1, -1))[0]
        return torch.tensor(txt_input, dtype=torch.float32)

    def _load_feature_transformers(self, task: str) -> None:
        """Download and load the three fitted transformers for ``task``.

        Instance state (the transformers and ``self.task``) is updated only
        after all three files have loaded, so a failed download or load never
        leaves the tokenizer marked as ready for a task it cannot serve.

        Raises:
            ValueError: If ``task`` is not 'classification' or 'regression'.
        """
        prefix = self._TASK_PREFIXES.get(task)
        if prefix is None:
            raise ValueError('task not defined')

        filenames = {
            modality: f"normalize_{prefix}_{modality}.joblib"
            for modality in ("tabular", "image", "text")
        }
        # One snapshot call fetches all three files.
        model_dir = snapshot_download(
            self._TRANSFORMER_REPO,
            allow_patterns=list(filenames.values()),
        )

        # NOTE(security): joblib.load unpickles the downloaded files, which can
        # execute arbitrary code; only use this with a trusted repository.
        transformer_tab = joblib.load(os.path.join(model_dir, filenames["tabular"]))
        transformer_img = joblib.load(os.path.join(model_dir, filenames["image"]))
        transformer_txt = joblib.load(os.path.join(model_dir, filenames["text"]))

        self.feature_transformer_tab = transformer_tab
        self.feature_transformer_img = transformer_img
        self.feature_transformer_txt = transformer_txt
        self.task = task

    def _batch_encode_plus(
        self,
        batch_smiles: list[str],
        task: str = 'classification',
        return_tensors: str = "pt",
        **kwargs
    ):
        """Featurize a batch of SMILES strings.

        Args:
            batch_smiles: SMILES strings to featurize. A single string is
                treated as a batch of one.
            task: 'classification' or 'regression'; selects which fitted
                transformers are used. They are (re)loaded whenever the task
                differs from the one used by the previous call.
            return_tensors: Passed to ``BatchEncoding`` as ``tensor_type``.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A ``BatchEncoding`` with keys ``"tab"``, ``"img"`` and ``"txt"``,
            each holding one stacked row per input SMILES.

        Raises:
            ValueError: If ``task`` is not a supported task name.
        """
        if isinstance(batch_smiles, str):
            # Avoid iterating over the characters of a lone SMILES string.
            batch_smiles = [batch_smiles]

        if self.task is None or self.task != task:
            self._load_feature_transformers(task)

        tab, img, txt = [], [], []
        for smiles in batch_smiles:
            tab.append(self.generate_tab_features(smiles))
            img.append(self.generate_img_features(smiles))
            txt.append(self.generate_txt_features(smiles))

        output = {
            "tab": torch.stack(tab),
            "img": torch.stack(img),
            "txt": torch.stack(txt),
        }
        return BatchEncoding(output, tensor_type=return_tensors)

    def encode(self,
               batch_smiles: list[str],
               task: str = 'classification',
               return_tensors: str = "pt",
               **kwargs):
        """Featurize a batch of SMILES; same behavior as calling the instance."""
        return self._batch_encode_plus(batch_smiles, task, return_tensors, **kwargs)

    def __call__(self,
                 batch_smiles: list[str],
                 task: str = 'classification',
                 return_tensors: str = "pt",
                 **kwargs):
        """Featurize a batch of SMILES; see ``_batch_encode_plus`` for details."""
        return self._batch_encode_plus(batch_smiles, task, return_tensors, **kwargs)

    def _tokenize(self, text, **kwargs):
        """Return an empty token list; this class does not tokenize text."""
        return []

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write nothing and return an empty tuple of file paths."""
        return ()

    def get_vocab(self):
        """Return the fixed five-entry special-token vocabulary."""
        return {"<pad>": 0, "<bos>": 1, "<eos>": 2, "<unk>": 3, "<mask>": 4}

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary returned by ``get_vocab``."""
        return len(self.get_vocab())