OliXio committed
Commit d5233a9 · verified · 1 Parent(s): bcfb88f

Upload 13 files

code/GNN/__init__.py ADDED
File without changes
code/GNN/featurizer.py ADDED
@@ -0,0 +1,138 @@
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors as rdDesc
from utils import *
import torch
import copy
from . import subgraphfp as subfp

PERIODIC_TABLE = Chem.GetPeriodicTable()
POSSIBLE_ATOMS = ['H', 'C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'I', 'B']
HYBRIDS = [Chem.rdchem.HybridizationType.SP, Chem.rdchem.HybridizationType.SP2,
           Chem.rdchem.HybridizationType.SP3, Chem.rdchem.HybridizationType.SP3D,
           Chem.rdchem.HybridizationType.SP3D2]
CHIRALS = [Chem.rdchem.ChiralType.CHI_UNSPECIFIED, Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
           Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, Chem.rdchem.ChiralType.CHI_OTHER]
BOND_TYPES = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
              Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]

def one_of_k_encoding(x, allowable_set):
    if x not in allowable_set:
        raise Exception("input {0} not in allowable set {1}".format(x, allowable_set))
    return list(map(lambda s: x == s, allowable_set))

def one_of_k_encoding_unk(x, allowable_set):
    """Maps inputs not in the allowable set to the last element."""
    if x not in allowable_set:
        x = allowable_set[-1]

    return list(map(lambda s: x == s, allowable_set))

def calc_atom_features_onehot(atom, feature):
    '''
    Computes atom-level features from an RDKit atom object.
    '''
    atom_features = one_of_k_encoding_unk(atom.GetSymbol(), POSSIBLE_ATOMS)
    atom_features += one_of_k_encoding_unk(atom.GetExplicitValence(), list(range(7)))
    atom_features += one_of_k_encoding_unk(atom.GetImplicitValence(), list(range(7)))
    atom_features += one_of_k_encoding_unk(atom.GetTotalNumHs(), list(range(5)))
    atom_features += one_of_k_encoding_unk(atom.GetNumRadicalElectrons(), list(range(5)))
    atom_features += one_of_k_encoding_unk(atom.GetTotalDegree(), list(range(7)))
    atom_features += one_of_k_encoding_unk(atom.GetFormalCharge(), list(range(-2, 3)))
    atom_features += one_of_k_encoding_unk(atom.GetHybridization(), HYBRIDS)
    atom_features += one_of_k_encoding_unk(atom.GetIsAromatic(), [False, True])
    atom_features += one_of_k_encoding_unk(atom.IsInRing(), [False, True])
    atom_features += one_of_k_encoding_unk(atom.GetChiralTag(), CHIRALS)
    # encode the CIP code itself; HasProp returns 0/1 and would never match 'R'/'S'
    atom_features += one_of_k_encoding_unk(atom.GetProp('_CIPCode') if atom.HasProp('_CIPCode') else '', ['R', 'S'])
    atom_features += [PERIODIC_TABLE.GetRvdw(atom.GetSymbol())]
    atom_features += [atom.HasProp('_ChiralityPossible')]
    atom_features += [atom.GetAtomicNum()]
    atom_features += [atom.GetMass() * 0.01]
    atom_features += [atom.GetDegree()]
    # append the feature invariant as six individual bits
    atom_features += [int(i) for i in list('{0:06b}'.format(feature))]

    return atom_features

def calc_adjacent_tensor(bonds, atom_num, with_ring_conj=False):
    '''
    Constructs an adjacency tensor that stacks one adjacency matrix per bond channel.
    :param bonds: bonds of an RDKit mol
    :param atom_num: the number of atoms in the RDKit mol
    :param with_ring_conj: whether to add "bond is in ring" and "bond is
        conjugated" channels to the adjacency tensor
    :return: adjacency tensor A shaped [N, F, N], where N is the atom count and F the number of bond channels
    '''
    bond_types = len(BOND_TYPES)
    if with_ring_conj:
        bond_types += 2

    A = np.zeros([atom_num, bond_types, atom_num])

    for bond in bonds:
        b, e = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        try:
            bond_type = BOND_TYPES.index(bond.GetBondType())
            A[b, bond_type, e] = 1
            A[e, bond_type, b] = 1
            if with_ring_conj:
                if bond.IsInRing():
                    A[b, bond_types-2, e] = 1
                    A[e, bond_types-2, b] = 1
                if bond.GetIsConjugated():
                    A[b, bond_types-1, e] = 1
                    A[e, bond_types-1, b] = 1
        except ValueError:
            # bond type not in BOND_TYPES; leave this bond out of the tensor
            pass
    return A

def calc_data_from_smile(smiles, addh=False, with_ring_conj=False, with_atom_feats=True, with_submol_fp=True, radius=2):
    '''
    Constructs the graph data of a molecule.
    :param smiles: SMILES representation of the molecule
    :param addh: whether to add explicit Hs to the mol
    :param with_ring_conj: whether to add "bond is in ring" and "bond is
        conjugated" channels to the adjacency tensor
    :return: dict with node features V, adjacency tensor A and mol_size
    '''
    mol = Chem.MolFromSmiles(smiles, sanitize=True)
    #mol.UpdatePropertyCache(strict=False)

    if addh:
        mol = Chem.AddHs(mol)
    #else:
    #    mol = Chem.RemoveHs(mol, sanitize=False)

    mol_size = torch.IntTensor([mol.GetNumAtoms()])

    V = []

    if with_atom_feats:
        features = rdDesc.GetFeatureInvariants(mol)

    submoldict = {}
    if with_submol_fp:
        atoms, submols = subfp.get_atom_submol_radn(mol, radius, sanitize=True)
        submoldict = dict(zip([a.GetIdx() for a in atoms], submols))

    for i in range(mol.GetNumAtoms()):
        atom_i = mol.GetAtomWithIdx(i)
        if with_atom_feats:
            atom_i_features = calc_atom_features_onehot(atom_i, features[i])
        else:
            atom_i_features = []

        if with_submol_fp:
            submol = submoldict[i]
            #print(Chem.MolToSmiles(submol))
            submolfp = subfp.gen_fps_from_mol(submol)
            atom_i_features.extend(submolfp)

        V.append(atom_i_features)

    V = torch.FloatTensor(V)

    if len(V.shape) != 2:
        return None

    A = calc_adjacent_tensor(mol.GetBonds(), mol.GetNumAtoms(), with_ring_conj)
    A = torch.FloatTensor(A)

    return {'V': V, 'A': A, 'mol_size': mol_size}
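A minimal usage sketch for the featurizer above (hypothetical, not part of the commit; it assumes the code/ directory is on PYTHONPATH so that GNN.featurizer and its utils/subgraphfp imports resolve):

from GNN.featurizer import calc_data_from_smile

# 'CCO' (ethanol) is an arbitrary test molecule
data = calc_data_from_smile('CCO', with_ring_conj=True)
if data is not None:
    # V: [N, C] per-atom features; A: [N, 6, N] -- four bond-type channels
    # plus the ring/conjugation channels enabled by with_ring_conj=True
    print(data['V'].shape, data['A'].shape, data['mol_size'])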
code/GNN/layers.py ADDED
@@ -0,0 +1,443 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import utils
import pickle

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GraphCNNLayer(nn.Module):
    def __init__(self, n_feats, adj_chans=4, n_filters=64, bias=True):
        super(GraphCNNLayer, self).__init__()
        self.n_feats = n_feats
        self.adj_chans = adj_chans
        self.n_filters = n_filters
        self.has_bias = bias

        # [C*L, F], C = n_feats, L = adj_chans, F = n_filters; this is for the edge feats
        self.weight_e = nn.Parameter(torch.FloatTensor(adj_chans*n_feats, n_filters))
        # [C, F], this is for 𝐈𝐕in𝐖0
        self.weight_i = nn.Parameter(torch.FloatTensor(n_feats, self.n_filters))

        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(n_filters))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.weight_e)
        nn.init.xavier_uniform_(self.weight_i)

        if self.bias is not None:
            self.bias.data.fill_(0.01)

    def forward(self, V, A):
        '''V node features: [b, N, C], A adjs: [b, N, L, N], L = adj_chans'''
        b, N, C = V.shape
        b, N, L, _ = A.shape

        # formula: 𝐕out = 𝐈𝐕in𝐖0 + GConv(𝐕in, 𝐹) + 𝐛; 𝐈𝐕in = 𝐕in, so 𝐈𝐕in𝐖0 = 𝐕in𝐖0

        # A [b, N, L, N] -> [b, N*L, N]
        A_reshape = A.view(-1, N*L, N)
        # [b, N*L, N] * [b, N, C] -> [b, N*L, C]
        n = torch.bmm(A_reshape, V)
        # [b, N*L, C] -> [b, N, L*C]
        n = n.view(-1, N, L*self.n_feats)

        # n [b, N, L*C], W [C*L, F], V [b, N, C], W_I [C, F]
        # -> [b, N, F] + [b, N, F] + b
        output = torch.matmul(n, self.weight_e) + torch.matmul(V, self.weight_i)

        if self.has_bias:
            output += self.bias

        # output: [b, N, F]
        return output

    def __repr__(self):
        return f'{self.__class__.__name__}(n_feats={self.n_feats},adj_chans={self.adj_chans},n_filters={self.n_filters},bias={self.has_bias}) -> [b, N, {self.n_filters}]'

class GraphResidualCNNLayer(nn.Module):
    def __init__(self, n_feats, adj_chans=4, bias=True):
        super(GraphResidualCNNLayer, self).__init__()
        self.n_feats = n_feats
        self.adj_chans = adj_chans
        self.has_bias = bias

        # one [C, C] linear map per adjacency channel, C = n_feats
        self.weight_layers = nn.ModuleList([nn.Linear(n_feats, n_feats) for _ in range(adj_chans)])

        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(n_feats))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        if self.bias is not None:
            self.bias.data.fill_(0.01)

    def forward(self, V, A):
        '''V node features: [b, N, C], A adjs: [b, N, L, N], L = adj_chans'''
        b, N, C = V.shape
        b, N, L, _ = A.shape

        for i in range(self.adj_chans):
            # [b, N, C] -> [b, N, C]
            hs = F.relu(self.weight_layers[i](V))
            # [b, N, N]
            a = A[:, :, i, :]
            a = a.view(-1, N, N)
            # [b, N, N] * [b, N, C] -> [b, N, C]
            V = V + torch.bmm(a, hs)

        if self.has_bias:
            V += self.bias

        # output: [b, N, C]
        return V

    def __repr__(self):
        return f'{self.__class__.__name__}(n_feats={self.n_feats},adj_chans={self.adj_chans},bias={self.has_bias}) -> [b, N, {self.n_feats}]'

class GraphAttentionLayer(nn.Module):
    def __init__(self, n_feats, adj_chans=4, n_filters=64, bias=True, dropout=0., alpha=0.2):
        super(GraphAttentionLayer, self).__init__()
        self.n_feats = n_feats
        self.adj_chans = adj_chans
        self.n_filters = n_filters
        self.has_bias = bias
        self.dropout = dropout
        self.alpha = alpha

        # one [C, F] projection plus two attention vectors per adjacency channel
        self.weight_list = nn.ParameterList([nn.Parameter(torch.FloatTensor(n_feats, n_filters)) for _ in range(adj_chans)])
        self.a1_list = nn.ParameterList([nn.Parameter(torch.FloatTensor(n_filters, 1)) for _ in range(adj_chans)])
        self.a2_list = nn.ParameterList([nn.Parameter(torch.FloatTensor(n_filters, 1)) for _ in range(adj_chans)])

        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(n_filters))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        for w in self.weight_list:
            nn.init.xavier_uniform_(w)
        for w in self.a1_list:
            nn.init.xavier_uniform_(w)
        for w in self.a2_list:
            nn.init.xavier_uniform_(w)
        if self.bias is not None:
            self.bias.data.fill_(0.01)

    def forward(self, V, A):
        '''V node features: [b, N, C], A adjs: [b, N, L, N], L = adj_chans'''
        b, N, C = V.shape
        b, N, L, _ = A.shape

        output = None

        for i in range(self.adj_chans):
            # [b, N, 1, N] -> [b, N, N]
            adj = A[:, :, i, :].view(-1, N, N)

            # [b, N, C] * [C, F] -> [b, N, F]
            h = torch.matmul(V, self.weight_list[i])
            # [b, N, F] * [F, 1] -> [b, N, 1]
            f_1 = torch.matmul(h, self.a1_list[i])
            # [b, N, F] * [F, 1] -> [b, N, 1]
            f_2 = torch.matmul(h, self.a2_list[i])

            # leaky_relu([b, N, 1] + [b, 1, N]) -> [b, N, N]
            e = F.leaky_relu(f_1 + f_2.transpose(1, 2), self.alpha)

            zero_vec = -9e15 * torch.ones_like(e)
            # [b, N, N]
            att = torch.where(adj > 0, e, zero_vec)
            att = F.softmax(att, dim=1)
            att = F.dropout(att, self.dropout, training=self.training)
            # [b, N, N] * [b, N, F] -> [b, N, F]
            if output is None:
                output = torch.matmul(att, h)
            else:
                output += torch.matmul(att, h)

        if self.has_bias:
            output += self.bias

        # output: [b, N, F]
        return output

    def __repr__(self):
        return f'{self.__class__.__name__}(n_feats={self.n_feats},adj_chans={self.adj_chans},n_filters={self.n_filters},bias={self.has_bias},dropout={self.dropout},alpha={self.alpha}) -> [b, N, {self.n_filters}]'

class GraphNodeCatGlobalFeatures(nn.Module):
    def __init__(self, global_feats, out_feats, mols=1, bias=True):
        super(GraphNodeCatGlobalFeatures, self).__init__()
        self.global_feats = global_feats
        self.out_feats = out_feats
        self.mols = mols
        self.has_bias = bias

        self.weights = nn.ParameterList([nn.Parameter(torch.FloatTensor(int(global_feats/mols), out_feats)) for _ in range(mols)])

        self.biass = []
        if bias:
            self.biass = nn.ParameterList([nn.Parameter(torch.FloatTensor(out_feats)) for _ in range(mols)])
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        for weight in self.weights:
            nn.init.xavier_uniform_(weight)
        for bias in self.biass:
            bias.data.fill_(0.01)

    def forward(self, V, global_state, graph_size, subgraph_size=None):
        # V: [b, N, Ov], global_state: [b, F], subgraph_size: [b, mols]
        b, N, Ov = V.shape
        O = self.out_feats
        if self.mols == 1:
            subgraph_size = graph_size.view(-1, 1)
            global_state = torch.mm(global_state, self.weights[0])
        else:
            # global_state: [b, F] view -> [b*mols, F/mols]
            global_state_view = global_state.view(b*self.mols, -1)

            # split global_state into that of individual mols
            idxmols = []
            for i in range(self.mols):
                idxmols.append(torch.IntTensor(list(range(i, b*self.mols, self.mols))).to(self.weights[0].device))

            global_states = []
            for i, idx in enumerate(idxmols):
                # select the global_state of mol i from global_state_view [b*mols, F/mols]; out shape is [b, F/mols]
                gs = global_state_view.index_select(dim=0, index=idx)
                # gs: [b, F/mols] * weight: [F/mols, O] -> [b, O]; F = global_feats, O = out_feats
                gs = torch.mm(gs, self.weights[i])

                if self.has_bias:
                    gs += self.biass[i]

                global_states.append(F.relu(gs))

            # convert global_states back to global_state
            # [[b, O] ... ] -> [b, mols*O]
            global_state = torch.cat(global_states, dim=1)

        # [b, mols*O] || [b, O] -> [b, (mols+1)*O]
        global_state_new = torch.cat([global_state, torch.zeros(b, O).to(self.weights[0].device)], dim=-1)
        # [b*(mols+1), O]
        global_state_new = global_state_new.view(-1, O)

        repeats = []
        for sz in subgraph_size:
            repeats.extend(sz.tolist() + [N-sz.sum()])
        repeats = torch.tensor(repeats).to(self.weights[0].device)

        # repeat from [b*(mols+1), O] -> [b*N, O]; the content looks like [m1_feats, m2_feats, ... mn_feats, pads, ...]
        global_state_new = global_state_new.repeat_interleave(repeats, dim=0)

        # V view: [b*N, Ov], global_state_new: [b*N, O]
        output = torch.cat([V.contiguous().view(-1, Ov), global_state_new], dim=1)

        # output: [b, N, Ov+O]
        return output.view(-1, N, Ov+O), global_state

    def __repr__(self):
        return f'{self.__class__.__name__}(global_feats={self.global_feats},out_feats={self.out_feats},bias={self.has_bias}) -> [b, N, {self.global_feats+self.out_feats}], [b, out_feats]'

class MultiHeadGlobalAttention(nn.Module):
    '''Input [b, N, C] -> output [b, n_head*C] if concat else [b, C]'''
    def __init__(self, n_feats, n_head=5, alpha=0.2, concat=True, bias=True):
        super(MultiHeadGlobalAttention, self).__init__()

        self.n_feats = n_feats
        self.n_head = n_head
        self.alpha = alpha
        self.concat = concat
        self.has_bias = bias

        self.weight = nn.Parameter(torch.FloatTensor(n_feats, n_head*n_feats))
        self.tune_weight = nn.Parameter(torch.FloatTensor(1, n_head, n_feats))

        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(n_head*n_feats))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.weight)
        nn.init.xavier_uniform_(self.tune_weight)
        if self.bias is not None:
            self.bias.data.fill_(0.01)

    def forward(self, V, graph_size):
        # Gather V of the mols in a batch; after this, the padding is removed.
        if V.shape[0] == 1:
            Vg = torch.squeeze(V)
            graph_size = [graph_size]
        else:
            Vg = torch.cat([torch.split(v.view(-1, v.shape[-1]), graph_size[i])[0] for i, v in enumerate(torch.split(V, 1))], dim=0)

        Vg = torch.matmul(Vg, self.weight)
        if self.has_bias:
            Vg += self.bias
        Vg = Vg.view(-1, self.n_head, self.n_feats)

        alpha = torch.mul(self.tune_weight, Vg)
        alpha = torch.sum(alpha, dim=-1)
        alpha = F.leaky_relu(alpha, self.alpha)  # original TF code: "alpha = tf.nn.leaky_relu(alpha, alpha=0.2)"
        alpha = utils.segment_softmax(alpha, graph_size)

        #alpha_collect = torch.mean(alpha, dim=-1)  # present in the original code but unused
        alpha = alpha.view(-1, self.n_head, 1)
        V = torch.mul(Vg, alpha)

        if self.concat:
            V = utils.segment_sum(V, graph_size)
            V = V.view(-1, self.n_head*self.n_feats)
        else:
            V = torch.mean(V, dim=1)
            V = utils.segment_sum(V, graph_size)

        return V

    def __repr__(self):
        if self.concat:
            outc = self.n_head*self.n_feats
        else:
            outc = self.n_feats
        return f'{self.__class__.__name__}(n_feats={self.n_feats},n_head={self.n_head},alpha={self.alpha},concat={self.concat},bias={self.has_bias}) -> [b, {outc}]'

class GraphEmbedPoolingLayer(nn.Module):
    def __init__(self, n_feats, n_filters=1, mask=None, bias=True):
        super(GraphEmbedPoolingLayer, self).__init__()
        self.n_feats = n_feats
        self.n_filters = n_filters
        self.mask = mask
        self.has_bias = bias

        self.emb = nn.Linear(n_feats, n_filters, bias=bias)

    def forward(self, V, A):
        # [b, N, F]
        factors = self.emb(V)

        if self.mask is not None:
            factors = torch.mul(factors, self.mask)

        factors = F.softmax(factors, dim=1)
        # [b, N, F] trans -> [b, F, N] * [b, N, C] -> [b, F, C]
        result = torch.matmul(factors.transpose(1, 2).contiguous(), V)

        if self.n_filters == 1:
            return result.view(-1, self.n_feats), A

        result_A = A.view(A.shape[0], -1, A.shape[-1])
        result_A = torch.matmul(result_A, factors)
        result_A = result_A.view(A.shape[0], A.shape[-1], -1)
        result_A = torch.matmul(factors.transpose(1, 2).contiguous(), result_A)
        result_A = result_A.view(A.shape[0], self.n_filters, A.shape[2], self.n_filters)

        return result, result_A

    def __repr__(self):
        return f'{self.__class__.__name__}(n_feats={self.n_feats},n_filters={self.n_filters},mask={self.mask},bias={self.has_bias}) -> [b, {self.n_filters}, {self.n_feats}], [b, {self.n_filters}, L, {self.n_filters}]'

class GConvBlockWithGF(nn.Module):
    def __init__(self,
                 n_feats,
                 n_filters,
                 global_feats,
                 global_out_feats,
                 mols=1,
                 adj_chans=4,
                 bias=True,
                 usegat=False):

        super(GConvBlockWithGF, self).__init__()

        self.n_feats = n_feats
        self.n_filters = n_filters
        self.global_out_feats = global_out_feats
        self.global_feats = global_feats
        self.mols = mols
        self.adj_chans = adj_chans
        self.has_bias = bias
        self.usegat = usegat

        self.broadcast_global_state = GraphNodeCatGlobalFeatures(global_feats, global_out_feats, mols, bias)
        if usegat:
            self.graph_conv = GraphAttentionLayer(n_feats+global_out_feats, adj_chans, n_filters)
        else:
            self.graph_conv = GraphCNNLayer(n_feats+global_out_feats, adj_chans, n_filters, bias)

        self.bn_global = nn.BatchNorm1d(global_out_feats*mols)
        self.bn_graph = nn.BatchNorm1d(n_filters)

    def forward(self, V, A, global_state, graph_size, subgraph_size):
        ######## broadcast global_state to the nodes ########
        # V shape from [b, N, C] to [b, N, C+G], G is global_out_feats
        V, global_state = self.broadcast_global_state(V, global_state, graph_size, subgraph_size)

        ######## graph convolution ########
        # V shape from [b, N, C+G] to [b, N, F1], F1 is n_filters
        V = self.graph_conv(V, A)
        V = self.bn_graph(V.transpose(1, 2).contiguous())
        V = F.relu(V.transpose(1, 2))

        global_state = F.relu(self.bn_global(global_state))

        return V, global_state

    def __repr__(self):
        return f'{self.__class__.__name__}(n_feats={self.n_feats},n_filters={self.n_filters},global_feats={self.global_feats},global_out_feats={self.global_out_feats},mols={self.mols},adj_chans={self.adj_chans},bias={self.has_bias},usegat={self.usegat}) -> [b, N, {self.n_filters}], [b, {self.global_out_feats*self.mols}]'

class GConvBlockNoGF(nn.Module):
    def __init__(self,
                 n_feats,
                 n_filters,
                 mols=1,
                 adj_chans=4,
                 bias=True):

        super(GConvBlockNoGF, self).__init__()

        self.n_feats = n_feats
        self.n_filters = n_filters
        self.mols = mols
        self.adj_chans = adj_chans
        self.has_bias = bias

        #self.graph_conv = GraphCNNLayer(n_feats+n_filters, adj_chans, n_filters, bias)
        self.graph_conv = GraphCNNLayer(n_feats, adj_chans, n_filters, bias)

        #self.bn_global = nn.BatchNorm1d(n_filters*mols)
        self.bn_graph = nn.BatchNorm1d(n_filters)

    def forward(self, V, A):
        ######## graph convolution ########
        # V shape from [b, N, C] to [b, N, F1], F1 is n_filters
        V = self.graph_conv(V, A)
        V = self.bn_graph(V.transpose(1, 2).contiguous())
        V = F.relu(V.transpose(1, 2))

        return V

    def __repr__(self):
        return f'{self.__class__.__name__}(n_feats={self.n_feats},n_filters={self.n_filters},mols={self.mols},adj_chans={self.adj_chans},bias={self.has_bias}) -> [b, N, {self.n_filters}]'
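A quick, hypothetical shape check for GraphCNNLayer (random stand-in tensors, not from the repo; assumes code/ is on PYTHONPATH):

import torch
from GNN.layers import GraphCNNLayer

b, N, C, L = 2, 9, 32, 4             # batch, atoms, node feats, adjacency channels
layer = GraphCNNLayer(n_feats=C, adj_chans=L, n_filters=64)
V = torch.randn(b, N, C)             # node features
A = torch.zeros(b, N, L, N)          # one [N, N] adjacency matrix per bond channel
out = layer(V, A)                    # V_out = V*W_0 + GConv(V, A) + bias
print(out.shape)                     # torch.Size([2, 9, 64])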
code/GNN/subgraphfp.py ADDED
@@ -0,0 +1,138 @@
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors as rdDesc
from collections import defaultdict
import numpy as np
import os, pickle, hashlib

AllChem.SetPreferCoordGen(True)

FINGERPRINT_DICT = defaultdict(lambda: len(FINGERPRINT_DICT))

ELEMENTS = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al',
            'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn',
            'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb',
            'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In',
            'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm',
            'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta',
            'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At',
            'Rn', 'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk',
            'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt',
            'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og']

for e in ELEMENTS:
    FINGERPRINT_DICT[e]

if os.path.exists('rdkit_fingerprint_list_r1.pkl'):
    l = pickle.load(open('rdkit_fingerprint_list_r1.pkl', 'rb'))

    for smi in l:
        FINGERPRINT_DICT[smi]

# the element symbols were inserted above, so they are already counted here
print('Len fingerprint_list: %s' % len(FINGERPRINT_DICT))

def mol_with_atom_index(mol):
    atoms = mol.GetNumAtoms()
    for idx in range(atoms):
        mol.GetAtomWithIdx(idx).SetProp('molAtomMapNumber', str(mol.GetAtomWithIdx(idx).GetIdx()))
    return mol

def prepare_mol_for_drawing(mol):
    try:
        mol_draw = Draw.rdMolDraw2D.PrepareMolForDrawing(mol)
    except Chem.KekulizeException:
        mol_draw = Draw.rdMolDraw2D.PrepareMolForDrawing(mol, kekulize=False)
        Chem.SanitizeMol(mol_draw, Chem.SANITIZE_ALL ^ Chem.SANITIZE_KEKULIZE)
    return mol_draw

def get_atom_submol_radn(mol, radius, sanitize=True):
    atoms = []
    submols = []
    #smis = []
    for atom in mol.GetAtoms():
        atoms.append(atom)
        r = radius
        # shrink the radius until a valid environment is found; note that if
        # every radius fails for an atom, submols ends up shorter than atoms
        while r > 0:
            try:
                env = Chem.FindAtomEnvironmentOfRadiusN(mol, r, atom.GetIdx())
                amap = {}
                submol = Chem.PathToSubmol(mol, env, atomMap=amap)
                if sanitize:
                    Chem.SanitizeMol(submol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL ^ Chem.SanitizeFlags.SANITIZE_KEKULIZE)
                #smis.append(Chem.MolToSmiles(submol))
                submols.append(submol)
                break
            except Exception as e:
                print(64, e)
                r -= 1

    return atoms, submols  #, smis

def gen_fps_from_mol(mol, nbits=256, use_morgan=True, use_macc=False, use_rdkit=False):
    fp = []
    if use_morgan:
        # Morgan fingerprint
        fp_vec = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nbits)
        fp1 = np.frombuffer(fp_vec.ToBitString().encode(), 'u1') - ord('0')
        fp = fp1.tolist()
    if use_macc:
        # MACCS keys
        fp_vec = MACCSkeys.GenMACCSKeys(mol)
        fp1 = np.frombuffer(fp_vec.ToBitString().encode(), 'u1') - ord('0')
        fp.extend(fp1.tolist())
    if use_rdkit:
        fp_vec = Chem.RDKFingerprint(mol)
        fp1 = np.frombuffer(fp_vec.ToBitString().encode(), 'u1') - ord('0')
        fp.extend(fp1.tolist())

    return fp

def gen_subgraph_fps_from_str(s, wordsdict={}):
    if s in wordsdict:
        return [wordsdict[s]]
    else:
        return [len(wordsdict)]

def gen_subgraph_fps_from_mol(mol, wordsdict={}):
    try:
        k = Chem.MolToSmiles(mol)
        return gen_subgraph_fps_from_str(k, wordsdict)
    except Exception as e:
        print(e)
        return [len(wordsdict)]

def calc_subgraph_fps_from_mol(mol, radius=2, nbits=128, use_macc=True, fptype=1, wordsdict={}):
    #atoms, submols, smis = get_atom_submol_radn(mol, radius, True)
    atoms, submols = get_atom_submol_radn(mol, radius, True)
    feats = []
    for idx, submol in enumerate(submols):
        if fptype == 1:
            # pass use_macc by keyword: positionally it would land on use_morgan
            feat = gen_fps_from_mol(submol, nbits, use_macc=use_macc)
            feats.append(feat)
        elif fptype == 2:
            feat = gen_subgraph_fps_from_mol(submol, wordsdict)
            feats.append(feat)

    return np.array(feats)

if __name__ == '__main__':
    smi = 'C=C(S)C(N)(O)C'
    smi = 'CC1CCN(CC1N(C)C2=NC=NC3=C2C=CN3)C(=O)CC#N'

    mol = Chem.MolFromSmiles(smi, sanitize=False)

    print(calc_subgraph_fps_from_mol(mol, 3))

    mol = mol_with_atom_index(mol)
    # get_atom_submol_radn returns (atoms, submols); only the submols are drawn
    atoms, submols = get_atom_submol_radn(mol, 3)
    submols = [prepare_mol_for_drawing(m) for m in submols]
    hl = []
    for idx, m in enumerate(submols):
        for a in m.GetAtoms():
            if int(a.GetProp('molAtomMapNumber')) == idx:
                hl.append([a.GetIdx()])
                break

    draw = Draw.MolsToGridImage([mol] + submols, highlightAtomLists=[[]] + hl, molsPerRow=5)
    draw.show()
code/GNN/utils.py ADDED
@@ -0,0 +1,240 @@
import torch

def gather(x, indices):
    indices = indices.view(-1, indices.shape[-1]).tolist()
    out = torch.cat([x[i] for i in indices])

    return out

def gather_nd(x, indices):
    newshape = indices.shape[:-1] + x.shape[indices.shape[-1]:]
    indices = indices.view(-1, indices.shape[-1]).tolist()
    out = torch.cat([x[tuple(i)] for i in indices])

    return out.reshape(newshape)

def gen_node_indices(size_list):
    '''Generate node indices for extracting the nodes of each graph from batched data.'''
    node_num = []
    node_range = []
    size_list = [int(i) for i in size_list]
    for i, n in enumerate(size_list):
        node_num.extend([i]*n)
        node_range.extend(list(range(n)))

    node_num = torch.tensor(node_num)
    node_range = torch.tensor(node_range)
    indices = torch.stack([node_num, node_range], dim=1)
    return indices, node_num, node_range

def segment_max(x, size_list):
    size_list = [int(i) for i in size_list]
    return torch.stack([torch.max(v, 0).values for v in torch.split(x, size_list)])

def segment_sum(x, size_list):
    size_list = [int(i) for i in size_list]
    return torch.stack([torch.sum(v, 0) for v in torch.split(x, size_list)])

def segment_softmax(gate, size_list):
    segmax = segment_max(gate, size_list)
    # expand segmax to the shape of gate
    segmax_expand = torch.cat([segmax[i].repeat(n, 1) for i, n in enumerate(size_list)], dim=0)
    subtract = gate - segmax_expand
    exp = torch.exp(subtract)
    segsum = segment_sum(exp, size_list)
    # expand segsum to the shape of gate
    segsum_expand = torch.cat([segsum[i].repeat(n, 1) for i, n in enumerate(size_list)], dim=0)
    attention = exp / (segsum_expand + 1e-16)

    return attention

def pad_V(V, max_n):
    N, C = V.shape
    if max_n > N:
        zeros = torch.zeros(max_n-N, C)
        V = torch.cat([V, zeros], dim=0)
    return V

def pad_A(A, max_n):
    N, L, _ = A.shape
    if max_n > N:
        zeros = torch.zeros(N, L, max_n-N)
        A = torch.cat([A, zeros], dim=-1)
        zeros = torch.zeros(max_n-N, L, max_n)
        A = torch.cat([A, zeros], dim=0)

    return A

def pad_prot(P, max_n):
    N, = P.shape
    if max_n > N:
        zeros = torch.zeros(max_n-N)
        P = torch.cat([P, zeros], dim=0)

    return P.type(torch.IntTensor)

def create_batch(input, pad=False, device=torch.device('cpu')):
    vl = []
    al = []
    gsl = []
    msl = []
    ssl = []
    lbl = []
    idxs = []
    smis = []

    for d in input:
        vl.append(d['V'])
        al.append(d['A'])
        gsl.append(d['G'])
        msl.append(d['mol_size'])
        ssl.append(d['subgraph_size'])
        lbl.append(d['label'])
        idxs.append(d['index'])
        smis.append(d['smiles'])

    if gsl[0] is not None:
        gsl = torch.stack(gsl, dim=0).to(device)

    if pad:
        max_n = max(map(lambda x: x.shape[0], vl))
        vl1 = []
        for v in vl:
            vl1.append(pad_V(v, max_n))
        al1 = []
        for a in al:
            al1.append(pad_A(a, max_n))

        return {'V': torch.stack(vl1, dim=0).to(device),
                'A': torch.stack(al1, dim=0).to(device),
                'G': gsl,
                'mol_size': torch.cat(msl, dim=0).to(device),
                'subgraph_size': torch.stack(ssl, dim=0).to(device),
                'label': torch.stack(lbl, dim=0).to(device),
                'index': idxs,
                'smiles': smis}

    return {'V': torch.stack(vl, dim=0).to(device),
            'A': torch.stack(al, dim=0).to(device),
            'G': gsl,
            'mol_size': torch.cat(msl, dim=0).to(device),
            'subgraph_size': torch.stack(ssl, dim=0).to(device),
            'label': torch.stack(lbl, dim=0).to(device),
            'index': idxs,
            'smiles': smis}

def create_mol_protein_batch(input, pad=False, device=torch.device('cpu'), pr=True):
    vl = []
    al = []
    gsl = []
    msl = []
    ssl = []
    prot = []
    seq = []
    lbl = []
    idxs = []
    smis = []
    fpl = []

    for d in input:
        vl.append(d['V'])
        al.append(d['A'])
        gsl.append(d['G'])
        msl.append(d['mol_size'])
        ssl.append(d['subgraph_size'])
        prot.append(d['protein_seq'])
        seq.append(d['protein'])
        lbl.append(d['label'])
        idxs.append(d['index'])
        smis.append(d['smiles'])
        if 'fp' in d:
            fpl.append(d['fp'])

    if gsl[0] is not None:
        if pad:
            gsl = torch.stack(gsl, dim=0).to(device)
        else:
            gsl = [torch.unsqueeze(g, 0) for g in gsl]

    # build fpt before branching: the no-pad return below references it too
    fpt = None
    if fpl:
        fpt = torch.stack(fpl, dim=0).to(device)

    if pad:
        max_n = max(map(lambda x: x.shape[0], vl))
        vl1 = []
        if pr:
            print('\tPadding V to max_n:', max_n)
        for v in vl:
            vl1.append(pad_V(v, max_n))

        al1 = []
        if pr:
            print('\tPadding A to max_n:', max_n)
        for a in al:
            al1.append(pad_A(a, max_n))

        max_prot = max(map(lambda x: x.shape[0], prot))
        prot1 = []
        if pr:
            print('\tPadding protein_seq to max_n:', max_prot)
        for p in prot:
            prot1.append(pad_prot(p, max_prot))

        return {'V': torch.stack(vl1, dim=0).to(device),
                'A': torch.stack(al1, dim=0).to(device),
                'G': gsl,
                'fp': fpt,
                'mol_size': torch.cat(msl, dim=0).to(device),
                'subgraph_size': torch.stack(ssl, dim=0).to(device),
                'protein_seq': torch.stack(prot1, dim=0).to(device),
                'label': torch.stack(lbl, dim=0).view(-1).to(device),
                'index': idxs,
                'smiles': smis,
                'protein': seq}

    return {'V': [torch.unsqueeze(v, 0) for v in vl],
            'A': [torch.unsqueeze(a, 0) for a in al],
            'G': gsl,
            'fp': fpt,
            'mol_size': torch.cat(msl, dim=0).to(device),
            'subgraph_size': [torch.unsqueeze(s, 0) for s in ssl],
            'protein_seq': [torch.unsqueeze(p, 0) for p in prot],
            'label': torch.stack(lbl, dim=0).view(-1).to(device),
            'index': idxs,
            'smiles': smis,
            'protein': seq}

def create_mol_protein_fp_batch(input, pad=False, device=torch.device('cpu'), pr=True):
    fp = []
    prot = []
    lbl = []
    idxs = []
    smis = []

    for d in input:
        fp.append(d['fp'])
        prot.append(d['protein_seq'])
        lbl.append(d['label'])
        idxs.append(d['index'])
        smis.append(d['smiles'])

    if pad:
        max_prot = max(map(lambda x: x.shape[0], prot))
        prot1 = []
        if pr:
            print('\tPadding protein_seq to max_n:', max_prot)
        for p in prot:
            prot1.append(pad_prot(p, max_prot))

        return {'fp': torch.stack(fp, dim=0).to(device),
                'protein_seq': torch.stack(prot1, dim=0).to(device),
                'label': torch.stack(lbl, dim=0).view(-1).to(device),
                'index': idxs,
                'smiles': smis}

    return {'fp': [torch.unsqueeze(f, 0) for f in fp],
            'protein_seq': [torch.unsqueeze(p, 0) for p in prot],
            'label': torch.stack(lbl, dim=0).view(-1).to(device),
            'index': idxs,
            'smiles': smis}
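A toy demonstration of the segment helpers above, on a "batch" of two graphs with 3 and 2 nodes (hypothetical, assuming code/GNN is on PYTHONPATH):

import torch
from utils import segment_sum, segment_softmax

x = torch.randn(5, 4)               # node rows of both graphs, stacked
sizes = [3, 2]                      # nodes per graph
att = segment_softmax(x, sizes)     # softmax taken within each graph separately
print(att[:3].sum(0))               # ~ones(4): the first graph's rows sum to 1
print(segment_sum(x, sizes).shape)  # torch.Size([2, 4]) -- one pooled row per graph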
code/cliplayers.py ADDED
@@ -0,0 +1,432 @@
from collections import OrderedDict
from typing import Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()

        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu2 = nn.ReLU(inplace=True)

        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu3 = nn.ReLU(inplace=True)

        self.downsample = None
        self.stride = stride

        if stride > 1 or inplanes != planes * Bottleneck.expansion:
            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
            self.downsample = nn.Sequential(OrderedDict([
                ("-1", nn.AvgPool2d(stride)),
                ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
                ("1", nn.BatchNorm2d(planes * self.expansion))
            ]))

    def forward(self, x: torch.Tensor):
        identity = x

        out = self.relu1(self.bn1(self.conv1(x)))
        out = self.relu2(self.bn2(self.conv2(out)))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu3(out)
        return out


class AttentionPool2d(nn.Module):
    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
        super().__init__()
        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        x = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
        x, _ = F.multi_head_attention_forward(
            query=x[:1], key=x, value=x,
            embed_dim_to_check=x.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False
        )
        return x.squeeze(0)


class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # the 3-layer stem
        self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width // 2)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(width // 2)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.relu3 = nn.ReLU(inplace=True)
        self.avgpool = nn.AvgPool2d(2)

        # residual layers
        self._inplanes = width  # this is a *mutable* variable used during construction
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        embed_dim = width * 32  # the ResNet feature dimension
        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        layers = [Bottleneck(self._inplanes, planes, stride)]

        self._inplanes = planes * Bottleneck.expansion
        for _ in range(1, blocks):
            layers.append(Bottleneck(self._inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        def stem(x):
            x = self.relu1(self.bn1(self.conv1(x)))
            x = self.relu2(self.bn2(self.conv2(x)))
            x = self.relu3(self.bn3(self.conv3(x)))
            x = self.avgpool(x)
            return x

        x = x.type(self.conv1.weight.dtype)
        x = stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.attnpool(x)

        return x

class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16."""

    def forward(self, x: torch.Tensor):
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)

class QuickGELU(nn.Module):
    def forward(self, x: torch.Tensor):
        return x * torch.sigmoid(1.702 * x)

class ResidualAttentionBlock(nn.Module):
    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        self.mlp = nn.Sequential(OrderedDict([
            ("c_fc", nn.Linear(d_model, d_model * 4)),
            ("gelu", QuickGELU()),
            ("c_proj", nn.Linear(d_model * 4, d_model))
        ]))
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

    def forward(self, x: torch.Tensor):
        x = x + self.attention(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class Transformer(nn.Module):
    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])

    def forward(self, x: torch.Tensor):
        return self.resblocks(x)


class VisionTransformer(nn.Module):
    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)

        scale = width ** -0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x: torch.Tensor):
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.positional_embedding.to(x.dtype)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        x = self.ln_post(x[:, 0, :])

        if self.proj is not None:
            x = x @ self.proj

        return x


class CLIP(nn.Module):
    def __init__(self,
                 embed_dim: int,
                 # vision
                 image_resolution: int,
                 vision_layers: Union[Tuple[int, int, int, int], int],
                 vision_width: int,
                 vision_patch_size: int,
                 # text
                 context_length: int,
                 vocab_size: int,
                 transformer_width: int,
                 transformer_heads: int,
                 transformer_layers: int
                 ):
        super().__init__()

        self.context_length = context_length

        if isinstance(vision_layers, (tuple, list)):
            vision_heads = vision_width * 32 // 64
            self.visual = ModifiedResNet(
                layers=vision_layers,
                output_dim=embed_dim,
                heads=vision_heads,
                input_resolution=image_resolution,
                width=vision_width
            )
        else:
            vision_heads = vision_width // 64
            self.visual = VisionTransformer(
                input_resolution=image_resolution,
                patch_size=vision_patch_size,
                width=vision_width,
                layers=vision_layers,
                heads=vision_heads,
                output_dim=embed_dim
            )

        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=self.build_attention_mask()
        )

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
        self.ln_final = LayerNorm(transformer_width)

        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.initialize_parameters()

    def initialize_parameters(self):
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.positional_embedding, std=0.01)

        if isinstance(self.visual, ModifiedResNet):
            if self.visual.attnpool is not None:
                std = self.visual.attnpool.c_proj.in_features ** -0.5
                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)

            for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
                for name, param in resnet_block.named_parameters():
                    if name.endswith("bn3.weight"):
                        nn.init.zeros_(param)

        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
        attn_std = self.transformer.width ** -0.5
        fc_std = (2 * self.transformer.width) ** -0.5
        for block in self.transformer.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        if self.text_projection is not None:
            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)

    def build_attention_mask(self):
        # lazily create causal attention mask, with full attention between the vision tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    @property
    def dtype(self):
        return self.visual.conv1.weight.dtype

    def encode_image(self, image):
        return self.visual(image.type(self.dtype))

    def encode_text(self, text):
        x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]

        x = x + self.positional_embedding.type(self.dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x).type(self.dtype)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection

        return x

    def forward(self, image, text):
        image_features = self.encode_image(image)
        text_features = self.encode_text(text)

        # normalized features
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()

        # shape = [global_batch_size, global_batch_size]
        return logits_per_image, logits_per_text


def convert_weights(model: nn.Module):
    """Convert applicable model parameters to fp16"""

    def _convert_weights_to_fp16(l):
        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            l.weight.data = l.weight.data.half()
            if l.bias is not None:
                l.bias.data = l.bias.data.half()

        if isinstance(l, nn.MultiheadAttention):
            for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
                tensor = getattr(l, attr)
                if tensor is not None:
                    tensor.data = tensor.data.half()

        for name in ["text_projection", "proj"]:
            if hasattr(l, name):
                attr = getattr(l, name)
                if attr is not None:
                    attr.data = attr.data.half()

    model.apply(_convert_weights_to_fp16)


def build_model(state_dict: dict):
    vit = "visual.proj" in state_dict

    if vit:
        vision_width = state_dict["visual.conv1.weight"].shape[0]
        vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
        image_resolution = vision_patch_size * grid_size
    else:
        counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
        vision_layers = tuple(counts)
        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
        output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
        vision_patch_size = None
        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
        image_resolution = output_width * 32

    embed_dim = state_dict["text_projection"].shape[1]
    context_length = state_dict["positional_embedding"].shape[0]
    vocab_size = state_dict["token_embedding.weight"].shape[0]
    transformer_width = state_dict["ln_final.weight"].shape[0]
    transformer_heads = transformer_width // 64
    transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))

    model = CLIP(
        embed_dim,
        image_resolution, vision_layers, vision_width, vision_patch_size,
        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
    )

    for key in ["input_resolution", "context_length", "vocab_size"]:
        if key in state_dict:
            del state_dict[key]

    convert_weights(model)
    model.load_state_dict(state_dict)
    return model.eval()
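An illustrative forward pass through a tiny, randomly initialized CLIP (made-up hyperparameters chosen only to keep it small; real checkpoints go through build_model(state_dict)):

import torch
from cliplayers import CLIP

model = CLIP(embed_dim=64, image_resolution=32, vision_layers=2,
             vision_width=64, vision_patch_size=8, context_length=16,
             vocab_size=1000, transformer_width=64, transformer_heads=4,
             transformer_layers=2)
images = torch.randn(2, 3, 32, 32)
texts = torch.randint(0, 1000, (2, 16))
logits_per_image, logits_per_text = model(images, texts)
print(logits_per_image.shape)  # torch.Size([2, 2]) -- pairwise cosine-similarity logits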
code/config.py ADDED
@@ -0,0 +1,95 @@
import torch, json, math, os

d = {
    'debug': True,
    'dataset_path': 'data/path_to_your_dataset.json',
    'fptype': 'morgan',
    'valid_ratio': 0.1,
    'batch_size': 128,
    'lr': 1e-3,
    'weight_decay': 1e-3,
    'patience': 2,
    'factor': 0.5,
    'add_nl': True,
    'binary_intn': False,
    'max_mz': 2000,
    'min_mz': 20,
    'energy': 'Energy1',
    'epochs': 50,
    'bin_size': 0.05,
    'ms_embedding_dim': 300,
    'projection_dim': 256,
    'ms_projection_layers': 1,
    'mol_embedding_dim': 2048,
    'mol_projection_layers': 1,
    'tsfm_in_ms': True,
    'tsfm_in_mol': False,
    'tsfm_layers': 6,
    'tsfm_heads': 8,
    'lstm_layers': 2,
    'lstm_in_ms': False,
    'lstm_in_mol': False,
    'dropout': 0.1,
    'nmodels': 1,
    'mol_encoder': 'fp',  # fp, gnn or gnn+fp
    'molgnn_n_filters_list': [256, 256, 256],
    'molgnn_nhead': 4,
    'molgnn_readout_layers': 2,
    'seed': 1234,
    'dev_name': 'cuda',
    'keep_best_models_num': 3
}

class ConfigDict(dict):
    '''
    Makes a dictionary behave like an object, with attribute-style access.
    '''
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value

    def save(self, fn, onlyprint=False):
        if onlyprint:
            print(self)
        else:
            json.dump(self, open(fn, 'w'), indent=2)

    def load_dict(self, dic):
        for k, v in dic.items():
            self[k] = v
        self.calc_ms_embedding_dim()

    def load(self, fn):
        try:
            if type(fn) is dict:
                d = fn
            elif type(fn) is str:
                if os.path.exists(fn):
                    d = json.load(open(fn, 'r'))
                else:
                    d = json.loads(fn)
            self.load_dict(d)
        except Exception as e:
            print(e)

    def calc_ms_embedding_dim(self):
        if 'bin_size' in self:
            self['ms_embedding_dim'] = math.ceil((self['max_mz'] - self['min_mz']) / self['bin_size'])
        if 'ms_embedding_dim' in self and 'add_nl' in self and self['add_nl']:
            # extra bins appended when neutral losses are added (fixed 200 m/z window)
            self['ms_embedding_dim'] += math.ceil(200 / self['bin_size'])

    @property
    def device(self):
        try:
            return torch.device(self['dev_name'])
        except Exception:
            return torch.device('cpu')


CFG = ConfigDict()
CFG.load_dict(d)
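A small usage sketch of the attribute-style access described in the docstring (the output file name is hypothetical):

from config import CFG

print(CFG.batch_size)      # 128 -- equivalent to CFG['batch_size']
CFG.lr = 5e-4              # attribute writes update the underlying dict
CFG.save('run_cfg.json')   # serialized as plain JSON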
code/dataset.py ADDED
@@ -0,0 +1,142 @@
+ import os, json
+ import torch
+ import utils
+
+ def calc_feats(smi, ms, nls, cfg):
+     item = {}
+     item['ms_bins'] = utils.ms_binner(ms, nls,
+                                       min_mz=cfg.min_mz,
+                                       max_mz=cfg.max_mz,
+                                       bin_size=cfg.bin_size,
+                                       add_nl=cfg.add_nl,
+                                       binary_intn=cfg.binary_intn)
+
+     fmcalced = False
+     if 'fp' in cfg.mol_encoder:
+         if 'fm' not in cfg.mol_encoder:
+             item['mol_fps'] = utils.mol_fp_encoder(smi,
+                                                    tp=cfg.fptype,
+                                                    nbits=cfg.mol_embedding_dim)
+         else:
+             item['mol_fps'], item['mol_fmvec'] = utils.mol_fp_fm_encoder(smi,
+                                                                          tp=cfg.fptype,
+                                                                          nbits=cfg.mol_embedding_dim)
+             fmcalced = True
+     if 'gnn' in cfg.mol_encoder:
+         f = utils.mol_graph_featurizer(smi)
+         if not f:
+             return None
+         item.update(f)
+     if 'fm' in cfg.mol_encoder and not fmcalced:
+         item['mol_fmvec'] = utils.smi2fmvec(smi)
+
+     return item
+
+ class Dataset(torch.utils.data.Dataset):
+     def __init__(self, inp, cfg):
+         if type(inp) is str:
+             self.data = json.load(open(inp))
+         else:
+             self.data = inp
+
+         self.cfg = cfg
+
+     def __getitem__(self, idx):
+         item = {}
+         try:
+             if 'nls' in self.data[idx]:
+                 nls = self.data[idx]['nls']
+             else:
+                 nls = []
+
+             ms = self.data[idx]['ms']
+             smi = self.data[idx]['smiles']
+
+             item = calc_feats(smi, ms, nls, self.cfg)
+
+         except Exception as e:
+             print('='*50, idx, str(e))
+             return None
+
+         return item
+
+     def __len__(self):
+         return len(self.data)
+
+ class DatasetGNNFP(torch.utils.data.Dataset):
+     def __init__(self, inp, cfg):
+         if type(inp) is str:
+             self.data = json.load(open(inp))
+         else:
+             self.data = inp
+
+         self.cfg = cfg
+
+     def __getitem__(self, idx):
+         try:
+             smi = self.data[idx]['smiles']
+             item = {}
+             item['mol_fps'] = utils.mol_fp_encoder(smi,
+                                                    tp=self.cfg.fptype,
+                                                    nbits=self.cfg.mol_embedding_dim)
+             item.update(utils.mol_graph_featurizer(smi))
+         except Exception as e:
+             print('='*50, idx, str(e))
+             return None
+
+         return item
+
+     def __len__(self):
+         return len(self.data)
+
+ class PathDataset(torch.utils.data.Dataset):
+     def __init__(self, pathlist, cfg):
+         self.fns = pathlist
+         self.cfg = cfg
+         self.data = {}
+
+     def __getitem__(self, idx):
+         try:
+             item = {}
+             nls = []
+             if idx not in self.data:
+                 out = self.proc_data(self.fns[idx], self.cfg.energy)
+                 if out is None:
+                     return None
+                 self.data[idx] = out
+
+             ms = self.data[idx]['ms']
+             smi = self.data[idx]['smiles']
+
+             item = calc_feats(smi, ms, nls, self.cfg)
+
+         except Exception as e:
+             print('='*50, idx, str(e))
+             return None
+
+         return item
+
+     def proc_data(self, fn, energy='Energy1'):
+         tl = open(fn).readlines()
+         l = []
+         smi = None
+         try:
+             flag = False
+             for i in tl:
+                 if energy in i:
+                     smi = i.split(';')[-2]
+                     flag = True
+                     continue
+                 if 'END IONS' in i:
+                     if flag:
+                         break
+                 if flag:
+                     mz, intn = i.split(' ')
+                     l.append((float(mz), float(intn)))
+         except Exception:
+             return None
+
+         if smi is None:
+             # the requested energy level was not found in this file
+             return None
+         out = {'ms': l, 'smiles': smi}
+         return out
+
+     def __len__(self):
+         return len(self.fns)
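A minimal sketch (spectrum and SMILES made up for illustration) of pushing an in-memory record through Dataset; with the default mol_encoder='fp', the item carries the binned spectrum and a Morgan fingerprint:

from config import CFG
from dataset import Dataset

records = [{'smiles': 'CCO', 'ms': [(31.02, 100.0), (45.03, 40.0)]}]
ds = Dataset(records, CFG)
item = ds[0]
print(item['ms_bins'].shape)   # (ms_embedding_dim,) binned spectrum (+ neutral-loss bins)
print(item['mol_fps'].shape)   # (mol_embedding_dim,) fingerprint vector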
code/modules.py ADDED
@@ -0,0 +1,158 @@
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ from config import CFG
+ import utils
+ import math
+ import numpy as np
+ from cliplayers import QuickGELU, Transformer as MSTsfmEncoder
+ from GNN import layers as gly
+
+ loss_func_ms = nn.CrossEntropyLoss()
+ loss_func = nn.CrossEntropyLoss()
+
+ class MolGNNEncoder(nn.Module):
+     def __init__(self,
+                  outdim,
+                  n_feats=74,  # 330 = 74 + 256 when a 256-bit Morgan sub-fingerprint is appended
+                  n_filters_list=[256, 256, 256],
+                  n_head=4,
+                  mols=1,
+                  adj_chans=6,
+                  readout_layers=2,
+                  bias=True):
+
+         super().__init__()
+
+         n_filters_list = [i for i in n_filters_list if i is not None]
+         lys = []
+
+         for i, nf in enumerate(n_filters_list):
+             if i == 0:
+                 nf1 = n_feats
+             else:
+                 nf1 = prevnf
+
+             prevnf = nf
+
+             ly = gly.GConvBlockNoGF(nf1, nf, mols, adj_chans, bias)
+             lys.append(ly)
+
+         self.block_layers = nn.ModuleList(lys)
+         self.attention_layer = gly.MultiHeadGlobalAttention(nf, n_head=n_head, concat=True, bias=bias)
+         self.readout_layers = nn.ModuleList([nn.Linear(nf*n_head, outdim, bias=bias)] + [nn.Linear(outdim, outdim) for _ in range(readout_layers-1)])
+         self.gelu = QuickGELU()
+
+     def forward(self, batch):
+         V = batch['V']
+         A = batch['A']
+         mol_size = batch['mol_size']
+
+         for ly in self.block_layers:
+             V = ly(V, A)
+
+         X = self.attention_layer(V, mol_size)
+
+         for ly in self.readout_layers:
+             X = self.gelu(ly(X))
+
+         return X
+
+ class ProjectionHead(nn.Module):
+     def __init__(self,
+                  embedding_dim,
+                  projection_dim,
+                  cfg,
+                  transformer=True,
+                  lstm=False):
+
+         super().__init__()
+
+         self.projection = nn.Linear(embedding_dim, projection_dim)
+         self.gelu = nn.GELU()  # QuickGELU()
+         self.transformer = None
+         if transformer:
+             self.transformer = MSTsfmEncoder(projection_dim, cfg.tsfm_layers, cfg.tsfm_heads)
+         self.lstm = None
+         if lstm:
+             self.lstm = nn.LSTM(input_size=projection_dim, hidden_size=projection_dim, num_layers=cfg.lstm_layers, batch_first=True)
+         self.dropout = nn.Dropout(cfg.dropout)
+
+     def forward(self, x):
+         projected = self.projection(x)
+         if self.transformer is None:
+             x = self.gelu(projected)
+         else:
+             x = self.transformer(projected)
+         if self.lstm is not None:
+             x, (_, _) = self.lstm(x)
+         x = self.dropout(x)
+
+         return x
+
+ # New name in the paper is CMSSPModel
+ class FragSimiModel(nn.Module):
+     def __init__(
+         self,
+         cfg
+     ):
+         super().__init__()
+
+         self.cfg = cfg
+         self.mol_gnn_encoder = None
+         mol_embedding_dim = cfg.mol_embedding_dim
+
+         if 'gnn' in self.cfg.mol_encoder:
+             self.mol_gnn_encoder = MolGNNEncoder(outdim=cfg.mol_embedding_dim,
+                                                  n_filters_list=cfg.molgnn_n_filters_list,
+                                                  n_head=cfg.molgnn_nhead,
+                                                  readout_layers=cfg.molgnn_readout_layers)
+             if 'fp' in self.cfg.mol_encoder:
+                 # GNN output and fingerprint are concatenated, doubling the input dim
+                 mol_embedding_dim = 2*cfg.mol_embedding_dim
+
+         if 'fm' in self.cfg.mol_encoder:
+             mol_embedding_dim += 10
+
+         self.ms_projection = ProjectionHead(cfg.ms_embedding_dim,
+                                             cfg.projection_dim,
+                                             cfg,
+                                             cfg.tsfm_in_ms,
+                                             cfg.lstm_in_ms)
+
+         self.mol_projection = ProjectionHead(mol_embedding_dim,
+                                              cfg.projection_dim,
+                                              cfg,
+                                              cfg.tsfm_in_mol,
+                                              cfg.lstm_in_mol)
+
+     def forward(self, batch):
+         ms_features = batch["ms_bins"]
+         mol_feat_list = []
+         if 'gnn' in self.cfg.mol_encoder:
+             mol_feat_list.append(self.mol_gnn_encoder(batch))
+         if 'fp' in self.cfg.mol_encoder:
+             mol_feat_list.append(batch["mol_fps"])
+         if 'fm' in self.cfg.mol_encoder:
+             mol_feat_list.append(batch["mol_fmvec"])
+
+         if len(mol_feat_list) > 1:
+             mol_features = torch.cat(mol_feat_list, dim=1)
+         else:
+             mol_features = mol_feat_list[0]
+
+         # Getting ms and mol embeddings (with the same dimension)
+         ms_embeddings = self.ms_projection(ms_features)
+         mol_embeddings = self.mol_projection(mol_features)
+
+         # Calculating the symmetric contrastive loss
+         #logits = (mol_embeddings @ ms_embeddings.t())
+         #logit_scale = self.logit_scale.exp()
+         logits = mol_embeddings @ ms_embeddings.t()
+
+         ground_truth = torch.arange(ms_features.shape[0], dtype=torch.long, device=self.cfg.device)
+
+         ms_loss = loss_func(logits, ground_truth)
+         mol_loss = loss_func(logits.t(), ground_truth)
+         loss = (ms_loss + mol_loss) / 2.0  # scalar: CrossEntropyLoss averages over the batch by default
+
+         return loss.mean()
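The loss above is the CLIP-style symmetric cross-entropy over the batch similarity matrix: row i should pick column i and vice versa. A self-contained toy version with random embeddings:

import torch
from torch import nn

ce = nn.CrossEntropyLoss()
mol = torch.randn(4, 256)      # 4 molecule embeddings
ms = torch.randn(4, 256)       # the 4 paired spectrum embeddings
logits = mol @ ms.t()          # (4, 4) similarity matrix
target = torch.arange(4)       # the i-th molecule matches the i-th spectrum
loss = (ce(logits, target) + ce(logits.t(), target)) / 2.0
print(loss.item())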
code/predict.py ADDED
@@ -0,0 +1,347 @@
+ from modules import *
+ import os, sys
+ import numpy as np
+ from tqdm import tqdm
+ import torch
+ from torch import nn
+ from config import CFG
+ import utils
+ import json
+ import pandas as pd
+ import pickle
+
+ MolFeatsCached = {}
+
+ def calc_mol_embeddings0(model, smis, cfg):
+     model.eval()
+
+     valid_mol_embeddings = []
+     with torch.no_grad():
+         for smi in smis:
+             try:
+                 mol_features = utils.mol_fp_encoder(smi, tp=cfg.fptype, nbits=cfg.mol_embedding_dim).to(cfg.device)
+                 mol_embeddings = model.mol_projection(mol_features.unsqueeze(0))
+                 valid_mol_embeddings.append(mol_embeddings.squeeze(0))
+             except Exception as e:
+                 print(smi, e)
+                 continue
+
+     return torch.stack(valid_mol_embeddings)
+
+ def calc_mol_embeddings1(model, smis, cfg):
+     model.eval()
+     mol_embeddings = []
+
+     with torch.no_grad():
+         for smi in smis:
+             try:
+                 if cfg.mol_encoder == 'fp':
+                     k = hash(smi + f'fp-{cfg.fptype}-{cfg.mol_embedding_dim}')
+                     if k in MolFeatsCached:
+                         feats = MolFeatsCached[k]
+                     else:
+                         feats = utils.mol_fp_encoder(smi, tp=cfg.fptype, nbits=cfg.mol_embedding_dim).to(cfg.device)
+                         MolFeatsCached[k] = feats
+                     me = model.mol_projection(feats.unsqueeze(0))
+                     mol_embeddings.append(me.squeeze(0))
+                 elif cfg.mol_encoder == 'gnn':
+                     k = hash(smi + 'gnn')
+                     if k in MolFeatsCached:
+                         gfeats = MolFeatsCached[k]
+                     else:
+                         gfeats = utils.mol_graph_featurizer(smi)
+                         MolFeatsCached[k] = gfeats
+
+                     bat = {'A': gfeats['A'].unsqueeze(0).to(cfg.device),
+                            'V': gfeats['V'].unsqueeze(0).to(cfg.device),
+                            'mol_size': gfeats['mol_size'].unsqueeze(0).to(cfg.device)}
+
+                     feats = model.mol_gnn_encoder(bat)
+                     me = model.mol_projection(feats)
+                     mol_embeddings.append(me.squeeze(0))
+             except Exception as e:
+                 print(smi, e)
+                 continue
+
+     return torch.stack(mol_embeddings)
+
+ def calc_mol_embeddings(model, smis, cfg):
+     model.eval()
+     fp_featsl = []
+     gnn_featsl = []
+     fm_featsl = []
+
+     for smi in smis:
+         try:
+             if 'gnn' in cfg.mol_encoder:
+                 k = hash(smi + 'gnn')
+                 if k in MolFeatsCached:
+                     gnn_feats = MolFeatsCached[k]
+                     if gnn_feats is None:
+                         continue
+                 else:
+                     gnn_feats = utils.mol_graph_featurizer(smi)
+                     MolFeatsCached[k] = gnn_feats
+                     if gnn_feats is None:
+                         continue
+                 gnn_featsl.append(gnn_feats)
+             if 'fp' in cfg.mol_encoder:
+                 k = hash(smi + f'fp-{cfg.fptype}-{cfg.mol_embedding_dim}')
+                 if k in MolFeatsCached:
+                     fp_feats = MolFeatsCached[k]
+                     if fp_feats is None:
+                         continue
+                 else:
+                     fp_feats = utils.mol_fp_encoder(smi, tp=cfg.fptype, nbits=cfg.mol_embedding_dim).to(cfg.device)
+                     MolFeatsCached[k] = fp_feats
+                 fp_featsl.append(fp_feats)
+             if 'fm' in cfg.mol_encoder:
+                 k = hash(smi + f'fm-{cfg.fptype}-{cfg.mol_embedding_dim}')
+                 if k in MolFeatsCached:
+                     fm_feats = MolFeatsCached[k]
+                     if fm_feats is None:
+                         continue
+                 else:
+                     fm_feats = utils.smi2fmvec(smi).to(cfg.device)
+                     MolFeatsCached[k] = fm_feats
+                 fm_featsl.append(fm_feats)
+         except Exception as e:
+             print(smi, e)
+             MolFeatsCached[k] = None
+             continue
+
+     mol_feat_list = []
+     if 'gnn' in cfg.mol_encoder:
+         vl, al, msl = [], [], []
+         bat = {}
+         for b in gnn_featsl:
+             if 'V' in b:
+                 vl.append(b['V'])
+             if 'A' in b:
+                 al.append(b['A'])
+             if 'mol_size' in b:
+                 msl.append(b['mol_size'])
+
+         vl1, al1 = [], []
+         if vl and al and msl:
+             max_n = max(map(lambda x: x.shape[0], vl))
+             for v in vl:
+                 vl1.append(utils.pad_V(v, max_n))
+             for a in al:
+                 al1.append(utils.pad_A(a, max_n))
+
+             bat['V'] = torch.stack(vl1).to(cfg.device)
+             bat['A'] = torch.stack(al1).to(cfg.device)
+             bat['mol_size'] = torch.cat(msl, dim=0).to(cfg.device)
+
+         mol_feat_list.append(model.mol_gnn_encoder(bat))
+
+     if 'fp' in cfg.mol_encoder:
+         mol_feat_list.append(torch.stack(fp_featsl).to(cfg.device))
+
+     if 'fm' in cfg.mol_encoder:
+         mol_feat_list.append(torch.stack(fm_featsl).to(cfg.device))
+
+     if len(mol_feat_list) > 1:
+         mol_features = torch.cat(mol_feat_list, dim=1).to(cfg.device)
+     else:
+         mol_features = mol_feat_list[0].to(cfg.device)
+
+     with torch.no_grad():
+         mol_embeddings = model.mol_projection(mol_features)
+
+     return mol_embeddings
+
+ def find_matches(model, ms, smis, cfg, n=10):
+     model.eval()
+     with torch.no_grad():
+         ms_features = utils.ms_binner(ms, min_mz=cfg.min_mz, max_mz=cfg.max_mz, bin_size=cfg.bin_size, add_nl=cfg.add_nl, binary_intn=cfg.binary_intn).to(cfg.device)
+         ms_features = ms_features.unsqueeze(0)
+         ms_embeddings = model.ms_projection(ms_features).squeeze(0)
+
+         #print(43, ms_features.shape, ms_embeddings.shape)
+
+         mol_embeddings = calc_mol_embeddings(model, smis, cfg)
+
+         mol_embeddings_n = F.normalize(mol_embeddings, p=2, dim=-1)
+         ms_embeddings_n = F.normalize(ms_embeddings, p=2, dim=-1)
+         dot_similarity = mol_embeddings_n @ ms_embeddings_n.t()
+
+         if n == -1 or n > len(mol_embeddings):
+             n = len(mol_embeddings)
+
+         values, indices = torch.topk(dot_similarity.squeeze(0), n)
+
+         # note: indices index into mol_embeddings; this assumes every SMILES
+         # in smis was featurized successfully (failed ones are skipped above)
+         matchsmis = [smis[idx] for idx in indices]
+
+         return matchsmis, values.to('cpu').data.numpy()*100, indices.to('cpu').data.numpy()
+
+ def calc(models, datal, cfg, saveout=True):
+     dicall = {}
+     coridxd = {}
+
+     for idx, model in enumerate(models):
+         for ni, data in enumerate(datal):
+             print(f'Calculating {ni}-th MS...')
+             #smipool = [d[1] for d in data['candidates'][:50]]
+             smipool = [d[1] for d in data['candidates']]
+
+             try:
+                 smis, scores, indices = find_matches(model, data['ms'], smipool, cfg, 50)
+             except Exception as e:
+                 print(131, e)
+                 continue
+
+             dic = {}
+             for n, smi in enumerate(smis):
+                 if smi in dic:
+                     dic[smi]['score'] += scores[n]
+                     dic[smi]['iscor'] = data['candidates'][indices[n]][-1]
+                     dic[smi]['idx'] = data['candidates'][indices[n]][0]
+                 else:
+                     dic[smi] = {'score': scores[n], 'iscor': data['candidates'][indices[n]][-1], 'idx': data['candidates'][indices[n]][0]}
+
+             ikey = data['ikey']
+             if ikey in dicall:
+                 for k, v in dic.items():
+                     if k in dicall[ikey]:
+                         dicall[ikey][k]['score'] += v['score']
+                     else:
+                         dicall[ikey][k] = v
+             else:
+                 dicall[ikey] = dic
+
+     for ikey, dic in dicall.items():
+         smis = [k for k in dic.keys()]
+         scorel = [d['score'] for d in dic.values()]
+         iscorl = [d['iscor'] for d in dic.values()]
+         indexl = [d['idx'] for d in dic.values()]
+
+         scoretsor = torch.tensor(scorel)
+         n = 100
+         if n > len(scorel):
+             n = len(scorel)
+
+         values, indices = torch.topk(scoretsor, n)
+
+         scorel = values
+         smis = [smis[i] for i in indices]
+         iscorl = [iscorl[i] for i in indices]
+         indexl = [indexl[i] for i in indices]
+
+         try:
+             i = iscorl.index(True)
+             k = 'Hit %.3d' % (i+1)
+             if k in coridxd:
+                 coridxd[k] += 1
+             else:
+                 coridxd[k] = 1
+         except ValueError:
+             pass
+
+     ks = sorted(list(coridxd.keys()))
+     dc = {}
+     sumtop3 = 0
+
+     for k in ks:
+         dc[k] = [coridxd[k]]
+         if k in ['Hit 001', 'Hit 002', 'Hit 003']:
+             sumtop3 += coridxd[k]
+
+     for i in range(100):
+         k = 'Hit %.3d' % (i+1)
+         if k not in dc:
+             dc[k] = [0]
+
+     # disabled CSV export (references variables not collected in this version)
+     '''if saveout:
+         df0 = pd.DataFrame(dc)
+         df0.to_csv('summary.csv', index=False)
+
+         df = pd.DataFrame({
+             'MSFn': ikeysl,
+             'Item': iteml,
+             'Index': smisidl,
+             'Smiles': smis,
+             'Score': scoresl,
+             'IsCorrect': iscorl})
+
+         df.to_csv('predicted.csv', index=False)'''
+
+     return sumtop3, dc, dicall
+
+ def test(modelfnl, datal, datafn=''):
+     maxtop3 = 0
+     maxoutt = ''
+
+     for fn in modelfnl:
+         d = torch.load(fn)
+         CFG.load(d['config'])
+         print(d['config'])
+         CFG.save('', True)
+
+         model = FragSimiModel(CFG).to(CFG.device)
+         model.load_state_dict(d['state_dict'])
+         model.to(CFG.device)
+
+         sumtop3, dc, dicall = calc([model], datal, CFG, saveout=False)
+
+         sumtop10 = 0
+         for k in ['Hit %.3d' % (i+1) for i in range(10)]:
+             if k in dc:
+                 sumtop10 += dc[k][0]
+
+         sumtop50 = 0
+         for k in ['Hit %.3d' % (i+1) for i in range(50)]:
+             if k in dc:
+                 sumtop50 += dc[k][0]
+
+         tops = {}
+         for i in range(100):
+             k = 'Hit %.3d' % (i+1)
+             key = k.replace('Hit', 'Top')
+             if key not in tops:
+                 tops[key] = [0]
+             if k in dc:
+                 for n in range(i+1):
+                     kk = 'Hit %.3d' % (n+1)
+                     if kk in dc:
+                         tops[key][0] += dc[kk][0]
+
+         outt = f'Top1: {dc.setdefault("Hit 001", [0])[0]}, top3: {sumtop3}, top10: {sumtop10}, top50: {sumtop50} of {len(datal)}'
+
+         if sumtop3 > maxtop3:
+             maxtop3 = sumtop3
+             maxoutt = outt
+
+         dicall['testdata'] = datafn
+         dicall['testrlt'] = outt
+         pickle.dump(dicall, open(fn.replace('.pth', f'-{os.path.basename(datafn).split(".")[0]}-tstrlt.pkl'), 'wb'))
+
+         df = pd.DataFrame(tops)
+         df.to_csv(fn.replace('.pth', f'-{os.path.basename(datafn).split(".")[0]}-tstrlt.csv'), index=False)
+
+     return maxoutt, maxtop3
+
+ def main(datafn, fnl):
+     outl = []
+
+     datal = json.load(open(datafn))
+     logfn = 'predict_results.csv'
+
+     if not os.path.exists(logfn):
+         open(logfn, 'w').write('Index,Results,Model,Data\n')
+
+     for n, fn in enumerate(fnl):
+         out, _ = test([fn], datal, datafn)
+         print(out, os.path.basename(fn))
+         outl.append(out)
+         open(logfn, 'a').write(f'{n},"{out}",{fn},{datafn}\n')
+
+     print(outl)
+
+ if __name__ == '__main__':
+     import time
+     t0 = time.time()
+     main(sys.argv[1], sys.argv[2:])
+     print(300, time.time()-t0)
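predict.py is driven from the command line: the first argument is a JSON test set (records carrying 'ms', 'ikey' and a 'candidates' list), the remaining arguments are .pth checkpoints. A hypothetical programmatic equivalent (paths are placeholders):

# same as: python predict.py data/test-pos.json data/train-001/model-best.pth
import json
from predict import test

datal = json.load(open('data/test-pos.json'))
out, top3 = test(['data/train-001/model-best.pth'], datal, 'data/test-pos.json')
print(out)   # e.g. 'Top1: ..., top3: ..., top10: ..., top50: ... of N'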
code/separate_posneg.py ADDED
@@ -0,0 +1,29 @@
+ import json
+ from tqdm import tqdm
+
+ if __name__ == '__main__':
+     import sys
+     fn = sys.argv[1]
+     d = json.load(open(fn))
+
+     lpos = []
+     lneg = []
+
+     for n, it in enumerate(d):
+         print(f'processing {n+1}th...')
+
+         try:
+             if it['Ion_Mode'].strip().lower() == 'negative':
+                 lneg.append(it)
+             else:
+                 lpos.append(it)
+         except KeyError:
+             # fall back to the adduct/species string when Ion_Mode is missing
+             if it['species'].strip().endswith('-'):
+                 lneg.append(it)
+             else:
+                 lpos.append(it)
+
+     print(f'Len lpos = {len(lpos)}, len lneg = {len(lneg)}, sum = {len(lpos)+len(lneg)}')
+
+     json.dump(lpos, open(fn.replace('.json', '-pos.json'), 'w'), indent=2)
+     json.dump(lneg, open(fn.replace('.json', '-neg.json'), 'w'), indent=2)
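A hypothetical run of the splitter (the path is a placeholder); it writes -pos.json and -neg.json siblings next to the input:

import json, subprocess

subprocess.run(['python', 'separate_posneg.py', 'data/dataset.json'])
neg = json.load(open('data/dataset-neg.json'))
print(len(neg))   # records whose Ion_Mode is 'negative' (or whose species string ends with '-')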
code/train.py ADDED
@@ -0,0 +1,251 @@
+ from utils import *
+ from modules import *
+ import os, sys
+ import numpy as np
+ from tqdm import tqdm
+ import random
+ import torch
+ from torch import nn
+ from config import CFG
+ from dataset import *
+ import torch.utils.data
+ import copy, json, pickle
+ import itertools as it
+
+ def make_next_record_dir(basedir, prefix=''):
+     path = '%s/%%s001/' % basedir
+     n = 2
+     while os.path.exists(path % prefix):
+         path = '%s/%%s%.3d/' % (basedir, n)
+         n += 1
+
+     pth = path % prefix
+     os.makedirs(pth)
+     return pth
+
+ def setup_seed(seed):
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     np.random.seed(seed)
+     random.seed(seed)
+     torch.backends.cudnn.deterministic = True
+
+ def my_collate(batch):
+     batch = list(filter(lambda x: (x is not None), batch))
+     msbinl, molfpl, molfml, vl, al, msl = [], [], [], [], [], []
+     bat = {}
+
+     for b in batch:
+         if 'ms_bins' in b:
+             msbinl.append(b['ms_bins'])
+         if 'mol_fps' in b:
+             molfpl.append(b['mol_fps'])
+         if 'mol_fmvec' in b:
+             molfml.append(b['mol_fmvec'])
+         if 'V' in b:
+             vl.append(b['V'])
+         if 'A' in b:
+             al.append(b['A'])
+         if 'mol_size' in b:
+             msl.append(b['mol_size'])
+
+     if msbinl:
+         bat['ms_bins'] = torch.stack(msbinl)
+     if molfpl:
+         bat['mol_fps'] = torch.stack(molfpl)
+     if molfml:
+         bat['mol_fmvec'] = torch.stack(molfml)
+     if vl and al and msl:
+         # pad all molecule graphs in the batch to the largest atom count
+         max_n = max(map(lambda x: x.shape[0], vl))
+         vl1, al1 = [], []
+         for v in vl:
+             vl1.append(pad_V(v, max_n))
+         for a in al:
+             al1.append(pad_A(a, max_n))
+
+         bat['V'] = torch.stack(vl1)
+         bat['A'] = torch.stack(al1)
+         bat['mol_size'] = torch.cat(msl, dim=0)
+
+     #return torch.utils.data.dataloader.default_collate(batch)
+     return bat
+
+ def make_train_valid(data, valid_ratio, seed=1234):
+     idxs = np.arange(len(data))
+     np.random.seed(seed)
+     np.random.shuffle(idxs)
+
+     lenval = int(valid_ratio*len(data))
+
+     valid_set = [data[i] for i in idxs[:lenval]]
+     train_set = [data[i] for i in idxs[lenval:]]
+
+     return train_set, valid_set
+
+ def build_loaders(inp, mode, cfg, num_workers):
+     if type(inp[0]) is dict:
+         dataset = Dataset(inp, cfg)
+     else:
+         dataset = PathDataset(inp, cfg)
+     dataloader = torch.utils.data.DataLoader(
+         dataset,
+         batch_size=cfg.batch_size,
+         num_workers=num_workers,
+         shuffle=True if mode == "train" else False,
+         collate_fn=my_collate
+     )
+     return dataloader
+
+ def train_epoch(model, train_loader, optimizer, lr_scheduler, step):
+     loss_meter = AvgMeter()
+     tqdm_object = tqdm(train_loader, total=len(train_loader))
+
+     for batch in tqdm_object:
+         for k, v in batch.items():
+             batch[k] = v.to(CFG.device)
+
+         loss = model(batch)
+         optimizer.zero_grad()
+         loss.backward()
+         optimizer.step()
+         if step == "batch":
+             lr_scheduler.step()
+
+         count = batch["ms_bins"].size(0)
+         loss_meter.update(loss.item(), count)
+
+         tqdm_object.set_postfix(train_loss=loss_meter.avg, lr=get_lr(optimizer))
+     return loss_meter
+
+ def valid_epoch(model, valid_loader):
+     loss_meter = AvgMeter()
+
+     tqdm_object = tqdm(valid_loader, total=len(valid_loader))
+     for batch in tqdm_object:
+         for k, v in batch.items():
+             batch[k] = v.to(CFG.device)
+
+         loss = model(batch)
+
+         count = batch["ms_bins"].size(0)
+         loss_meter.update(loss.item(), count)
+
+         tqdm_object.set_postfix(valid_loss=loss_meter.avg)
+
+     return loss_meter
+
+ def main(data, cfg=CFG, savedir='data/train', encmodel=None, ratio=1):
+     setup_seed(cfg.seed)
+
+     train_set, valid_set = make_train_valid(data, valid_ratio=cfg.valid_ratio, seed=cfg.seed)
+
+     n = len(train_set)
+     if ratio < 1:
+         train_set = random.sample(train_set, int(n*ratio))
+         print(f'Ratio {ratio}, lenall {n}, newtrainset {len(train_set)}')
+
+     train_loader = build_loaders(train_set, "train", cfg, 10)
+     valid_loader = build_loaders(valid_set, "valid", cfg, 10)
+
+     step = "epoch"
+
+     best_loss = float('inf')
+     best_model_fn = ''
+     best_model_fns = []
+
+     model = FragSimiModel(cfg).to(cfg.device)
+
+     if encmodel is not None:
+         model.mol_gnn_encoder.load_state_dict(encmodel.mol_gnn_encoder.state_dict())
+         # freeze the mol_gnn_encoder weights (currently disabled)
+         '''for name, param in model.named_parameters():
+             if 'mol_gnn_encoder' in name:
+                 print(152, 'freeze mol_gnn_encoder weights')
+                 param.requires_grad = False'''
+
+     print(model)
+
+     optimizer = torch.optim.AdamW(
+         model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay
+     )
+
+     lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+         optimizer, mode="min", patience=cfg.patience, factor=cfg.factor
+     )
+
+     for epoch in range(cfg.epochs):
+         print(f"Epoch: {epoch + 1}/{cfg.epochs}")
+         model.train()
+         train_loss = train_epoch(model, train_loader, optimizer, lr_scheduler, step)
+         model.eval()
+         with torch.no_grad():
+             valid_loss = valid_epoch(model, valid_loader)
+
+         if valid_loss.avg < best_loss:
+             best_loss = valid_loss.avg
+             best_model_fn = f"{savedir}/model-tloss{round(train_loss.avg, 3)}-vloss{round(valid_loss.avg, 3)}-epoch{epoch}.pth"
+             best_model_fn_base = best_model_fn.replace('.pth', '')
+             n = 1
+             while os.path.exists(best_model_fn):
+                 best_model_fn = best_model_fn_base + f'-{n}.pth'
+                 n += 1
+
+             checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'config': dict(CFG)}
+             best_model_fns.append(best_model_fn)
+             torch.save(checkpoint, best_model_fn)
+             print("Saved Best Model!")
+
+     best_model_fnl = []
+     for fn in best_model_fns:
+         if os.path.exists(fn):
+             best_model_fnl.append(fn)
+
+     for fn in best_model_fnl[:-cfg.keep_best_models_num]:
+         os.remove(fn)
+
+     best_model_fnl = best_model_fnl[-cfg.keep_best_models_num:]
+
+     print(best_model_fnl, best_loss)
+     return best_model_fnl, best_loss
+
+ if __name__ == "__main__":
+     try:
+         conffn = sys.argv[1]
+         if conffn.endswith('.json'):
+             CFG.load(sys.argv[1])
+         elif conffn.endswith('.pth'):
+             dpath = CFG.dataset_path
+             d = torch.load(conffn)
+             CFG.load(d['config'])
+             CFG.dataset_path = dpath
+             print('Use config from', conffn)
+     except Exception:
+         pass
+
+     try:
+         savedir = sys.argv[2]
+     except IndexError:
+         savedir = 'data/'
+
+     os.system('mkdir -p %s' % savedir)
+
+     mg = None
+
+     print(CFG)
+
+     if os.path.isdir(CFG.dataset_path):
+         data = [os.path.join(CFG.dataset_path, i) for i in os.listdir(CFG.dataset_path) if i.endswith('mgf')]
+     elif os.path.isfile(CFG.dataset_path):
+         if CFG.dataset_path.endswith('.pkl'):
+             data = pickle.load(open(CFG.dataset_path, 'rb'))
+         else:
+             data = json.load(open(CFG.dataset_path))
+             pklfn = CFG.dataset_path.replace('.json', '.pkl')
+             if not os.path.exists(pklfn):
+                 pickle.dump(data, open(pklfn, 'wb'))
+
+     subdir = make_next_record_dir(savedir, 'train-')
+     os.system(f'cp -a *py {subdir}; cp -a GNN {subdir}')
+     CFG.save(f'{subdir}/config.json')
+
+     modelfnl, _ = main(data, CFG, subdir, mg)
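A small sketch of the entry points above (toy records, placeholder save dir); the commented line is how a full run with loaders, AdamW and ReduceLROnPlateau would start:

# same as: python train.py config.json data/train
from config import CFG
from train import make_train_valid, main

data = [{'smiles': 'CCO', 'ms': [(31.02, 100.0)]} for _ in range(10)]   # toy records
train_set, valid_set = make_train_valid(data, valid_ratio=0.1, seed=CFG.seed)
print(len(train_set), len(valid_set))   # 9 1
# main(data, CFG, savedir='data/train-demo')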
code/utils.py ADDED
@@ -0,0 +1,370 @@
+ from rdkit import Chem
+ from rdkit.Chem import AllChem, MACCSkeys
+ from rdkit.Chem.rdmolops import FastFindRings
+ from rdkit.Chem.rdMolDescriptors import CalcMolFormula
+ import torch
+ import numpy as np
+ import scipy
+ import scipy.sparse as ss
+ import scipy.sparse.linalg
+ import math
+ import json
+ import itertools as it
+ import re
+ from GNN import featurizer as ft
+
+ import rdkit.RDLogger as rkl
+ logger = rkl.logger()
+ logger.setLevel(rkl.ERROR)
+
+ import rdkit.rdBase as rkrb
+ rkrb.DisableLog('rdApp.error')
+
+ # Morgan fingerprint bits with relative abundance > 5% across ~500,000 metabolites
+ FPBitIdx = [1, 5, 13, 41, 69, 80, 84, 94, 114, 117, 118, 119, 125, 133, 145,
+             147, 191, 192, 197, 202, 222, 227, 231, 249, 283, 294, 310, 314,
+             322, 333, 352, 361, 378, 387, 389, 392, 401, 406, 441, 478, 486,
+             489, 519, 521, 524, 555, 561, 591, 598, 599, 610, 622, 650, 656,
+             667, 675, 677, 679, 680, 694, 695, 715, 718, 722, 729, 736, 739,
+             745, 750, 760, 775, 781, 787, 794, 798, 802, 807, 811, 823, 835,
+             841, 849, 869, 872, 874, 875, 881, 890, 896, 926, 935, 980, 991,
+             1004, 1009, 1017, 1019, 1027, 1028, 1035, 1037, 1039, 1057, 1060,
+             1066, 1070, 1077, 1088, 1097, 1114, 1126, 1136, 1142, 1143, 1145,
+             1152, 1154, 1160, 1162, 1171, 1181, 1195, 1199, 1202, 1218, 1234,
+             1236, 1243, 1257, 1267, 1274, 1279, 1283, 1292, 1294, 1309, 1313,
+             1323, 1325, 1349, 1356, 1357, 1366, 1380, 1381, 1385, 1386, 1391,
+             1399, 1436, 1440, 1441, 1444, 1452, 1454, 1457, 1475, 1476, 1477,
+             1480, 1487, 1516, 1536, 1544, 1558, 1564, 1573, 1599, 1602, 1604,
+             1607, 1619, 1648, 1670, 1683, 1693, 1716, 1722, 1737, 1738, 1745,
+             1747, 1750, 1754, 1755, 1764, 1781, 1803, 1808, 1810, 1816, 1838,
+             1844, 1847, 1855, 1860, 1866, 1873, 1905, 1911, 1917, 1921, 1923,
+             1928, 1933, 1950, 1951, 1970, 1977, 1980, 1984, 1991, 2002, 2033, 2034, 2038]
+
+ class ConfigDict(dict):
+     '''
+     Makes a dictionary behave like an object, with attribute-style access.
+     '''
+     def __getattr__(self, name):
+         try:
+             return self[name]
+         except KeyError:
+             raise AttributeError(name)
+
+     def __setattr__(self, name, value):
+         self[name] = value
+
+     def save(self, fn):
+         json.dump(self, open(fn, 'w'), indent=2)
+
+     def load_dict(self, dic):
+         for k, v in dic.items():
+             self[k] = v
+
+     def load(self, fn):
+         try:
+             d = json.load(open(fn, 'r'))
+             self.load_dict(d)
+         except Exception as e:
+             print(e)
+
+ def conv_out_dim(length_in, kernel, stride, padding, dilation):
+     length_out = (length_in + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1
+     return length_out
+
+ def filter_ms(ms, thr=0.05, max_mz=2000):
+     mz = []
+     intn = []
+     maxi = 0
+     for m, i in ms:
+         if m < max_mz and i > maxi:
+             maxi = i
+
+     for m, i in ms:
+         if m < max_mz and i/maxi > thr:
+             mz.append(m)
+             intn.append(round(i/maxi*100, 2))
+
+     return mz, intn
+
+ def calc_nls(ms, thr=0.05, max_mz=2000):
+     mz, intn = filter_ms(ms, thr=thr, max_mz=max_mz)
+
+     nlmass = []
+     nlintn = []
+     for a, b in it.combinations(mz[::-1], 2):
+         nl = a - b
+         if 0 < nl < 200:
+             nlmass.append(round(nl, 5))
+             idxa = mz.index(a)
+             idxb = mz.index(b)
+             nlintn.append(round((intn[idxa]+intn[idxb])/2., 5))
+
+     nls = sorted(list(zip(nlmass, nlintn)))
+     return nls
+
+ def ms_binner(ms, nls=[], min_mz=20, max_mz=2000, bin_size=0.05, add_nl=False, binary_intn=False):
+     """
+     Convert the given spectrum to a dense binned torch vector.
+
+     Parameters
+     ----------
+     ms : list of (mz, intensity) pairs
+         The peaks of the spectrum to be converted.
+     nls : list of (mass, intensity) pairs, optional
+         Precomputed neutral losses; computed from `ms` when empty and `add_nl` is set.
+     min_mz, max_mz : float
+         The m/z range covered by the fragment bins.
+     bin_size : float
+         The bin width in m/z.
+     add_nl : bool
+         Prepend a binned neutral-loss vector covering 0-200 Da.
+     binary_intn : bool
+         Use 0/1 intensities instead of normalized ones.
+
+     Returns
+     -------
+     torch.FloatTensor
+         The binned spectrum vector (neutral-loss bins first when add_nl is True).
+     """
+     if add_nl and not nls:
+         nls = calc_nls(ms, max_mz=max_mz)
+
+     nltensor = None
+     mz, intn = filter_ms(ms)
+
+     if add_nl:
+         nlmass = []
+         nlintn = []
+
+         if not nls:
+             nls = calc_nls(ms, max_mz=max_mz)
+
+         for m, i in nls:
+             if m < 200:
+                 if binary_intn:
+                     i = 1
+                 nlmass.append(m)
+                 nlintn.append(i)
+
+         nlmass = np.array(nlmass)
+         nlintn = np.array(nlintn)
+         if len(nlintn) > 0:
+             nlintn = nlintn/nlintn.max()
+         num_nlbins = math.ceil(200 / bin_size)
+         #print('num_nlbins', num_nlbins)
+         nlbins = (nlmass / bin_size).astype(np.int32)
+
+         if len(nlmass) > 0:
+             vecnl = ss.csr_matrix(
+                 (nlintn,
+                  (np.repeat(0, len(nlintn)), nlbins)),
+                 shape=(1, num_nlbins),
+                 dtype=np.float32)
+
+             vecnl = (vecnl / scipy.sparse.linalg.norm(vecnl)*100)
+             nltensor = torch.FloatTensor(vecnl.todense()).view(-1)
+         else:
+             nltensor = torch.zeros(num_nlbins)
+
+     mz = np.array(mz)
+     keepidx = (mz <= max_mz)
+     mz = mz[keepidx]
+     intn = np.array(intn)
+     intn = intn[keepidx]
+
+     if binary_intn:
+         intn[intn > 0] = 1.0
+     elif len(intn) > 0:
+         intn = intn/intn.max()
+
+     num_bins = math.ceil((max_mz - min_mz) / bin_size)
+     #print('num_bins', num_bins)
+     bins = ((mz - min_mz) / bin_size).astype(np.int32)
+
+     #print(num_bins, intn, bins)
+
+     if len(mz) > 0:
+         vec = ss.csr_matrix(
+             (intn,
+              (np.repeat(0, len(intn)), bins)),
+             shape=(1, num_bins),
+             dtype=np.float32)
+
+         if not binary_intn:
+             vec = (vec / scipy.sparse.linalg.norm(vec)*100)
+
+         mstensor = torch.FloatTensor(vec.todense()).view(-1)
+     else:
+         mstensor = torch.zeros(num_bins)
+
+     if nltensor is not None:
+         return torch.cat([nltensor, mstensor], dim=0)
+
+     return mstensor
+
+ def formula2vec(formula, elements=['C', 'H', 'O', 'N', 'P', 'S', 'P', 'F', 'Cl', 'Br']):
+     # note: 'P' is listed twice; list.index() returns the first occurrence, so the
+     # second slot is never populated (the vector length stays 10, matching the
+     # +10 added to mol_embedding_dim in modules.py)
+     formula_p = re.findall(r'([A-Z][a-z]*)(\d*)', formula)
+     vec = np.zeros(len(elements))
+     for i in range(len(formula_p)):
+         ele = formula_p[i][0]
+         num = formula_p[i][1]
+         if num == '':
+             num = 1
+         else:
+             num = int(num)
+         if ele in elements:
+             vec[elements.index(ele)] += num
+     return np.array(vec)
+
+ def mol_fp_encoder0(smiles, tp='rdkit', nbits=2048):
+     mol = Chem.MolFromSmiles(smiles)
+     if mol is None:
+         mol = Chem.MolFromSmiles(smiles, sanitize=False)
+         if mol is not None:
+             mol.UpdatePropertyCache()
+             FastFindRings(mol)
+
+     if mol is None:
+         return None, None
+
+     if tp == 'morgan':
+         fp_vec = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nbits)
+         fp = np.frombuffer(fp_vec.ToBitString().encode(), 'u1') - ord('0')
+         fp = fp.tolist()
+     elif tp == 'morgan1':
+         # 2048-bit Morgan fingerprint reduced to the frequent bits in FPBitIdx
+         fp_vec = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
+         fp = np.frombuffer(fp_vec.ToBitString().encode(), 'u1') - ord('0')
+         fp = fp[FPBitIdx].tolist()
+     elif tp == 'macc':
+         # MACCS keys
+         fp_vec = MACCSkeys.GenMACCSKeys(mol)
+         fp = np.frombuffer(fp_vec.ToBitString().encode(), 'u1') - ord('0')
+         fp = fp.tolist()
+     elif tp == 'rdkit':
+         fp_vec = Chem.RDKFingerprint(mol, nBitsPerHash=1)
+         fp = np.frombuffer(fp_vec.ToBitString().encode(), 'u1') - ord('0')
+         fp = fp.tolist()
+     else:
+         raise ValueError(f'unknown fingerprint type: {tp}')
+
+     return torch.FloatTensor(fp), mol
+
+ def mol_fp_encoder(smiles, tp='rdkit', nbits=2048):
+     fpenc, _ = mol_fp_encoder0(smiles, tp, nbits)
+     return fpenc
+
+ def mol_fp_fm_encoder(smiles, tp='rdkit', nbits=2048):
+     fmenc = None
+     fpenc, mol = mol_fp_encoder0(smiles, tp, nbits)
+     if mol is not None:
+         fm = CalcMolFormula(mol)
+         fmenc = torch.FloatTensor(formula2vec(fm))
+     return fpenc, fmenc
+
+ def smi2fmvec(smiles):
+     mol = Chem.MolFromSmiles(smiles)
+     if mol is None:
+         return None
+     fm = CalcMolFormula(mol)
+     fmenc = torch.FloatTensor(formula2vec(fm))
+
+     return fmenc
+
+ def mol_graph_featurizer(smiles):
+     # mol_graph = {V, A, mol_size}
+     '''mol_graph = ft.calc_data_from_smile(smiles,
+                                            addh=True,
+                                            with_ring_conj=True,
+                                            with_atom_feats=True,
+                                            with_submol_fp=True,
+                                            radius=2)
+     '''
+     mol_graph = ft.calc_data_from_smile(smiles,
+                                         addh=False,
+                                         with_ring_conj=True,
+                                         with_atom_feats=True,
+                                         with_submol_fp=False,
+                                         radius=2)
+     return mol_graph
+
+ def pad_V(V, max_n):
+     N, C = V.shape
+     if max_n > N:
+         zeros = torch.zeros(max_n-N, C)
+         V = torch.cat([V, zeros], dim=0)
+     return V
+
+ def pad_A(A, max_n):
+     N, L, _ = A.shape
+     if max_n > N:
+         zeros = torch.zeros(N, L, max_n-N)
+         A = torch.cat([A, zeros], dim=-1)
+         zeros = torch.zeros(max_n-N, L, max_n)
+         A = torch.cat([A, zeros], dim=0)
+     return A
+
+ class AvgMeter:
+     def __init__(self, name="Metric"):
+         self.name = name
+         self.reset()
+
+     def reset(self):
+         self.avg, self.sum, self.count = [0] * 3
+
+     def update(self, val, count=1):
+         self.count += count
+         self.sum += val * count
+         self.avg = self.sum / self.count
+
+     def __repr__(self):
+         text = f"{self.name}: {self.avg:.4f}"
+         return text
+
+ def get_lr(optimizer):
+     for param_group in optimizer.param_groups:
+         return param_group["lr"]
+
+ def segment_max(x, size_list):
+     size_list = [int(i) for i in size_list]
+     return torch.stack([torch.max(v, 0).values for v in torch.split(x, size_list)])
+
+ def segment_sum(x, size_list):
+     size_list = [int(i) for i in size_list]
+     return torch.stack([torch.sum(v, 0) for v in torch.split(x, size_list)])
+
+ def segment_softmax(gate, size_list):
+     segmax = segment_max(gate, size_list)
+     # expand segmax to the shape of gate
+     segmax_expand = torch.cat([segmax[i].repeat(n, 1) for i, n in enumerate(size_list)], dim=0)
+     subtract = gate - segmax_expand
+     exp = torch.exp(subtract)
+     segsum = segment_sum(exp, size_list)
+     # expand segsum to the shape of gate
+     segsum_expand = torch.cat([segsum[i].repeat(n, 1) for i, n in enumerate(size_list)], dim=0)
+     attention = exp / (segsum_expand + 1e-16)
+
+     return attention
+
+ def pad_ms_list(ms_list, thr=0.05, min_mz=20, max_mz=2000):
+     thr = thr*100
+     mslst = []
+     for ms in ms_list:
+         ms = np.array(ms)
+         ms[:, 1] = ms[:, 1]/ms[:, 1].max()*100
+
+         if thr > 0:
+             ms = ms[(ms[:, 1] >= thr)]
+
+         ms = ms[(ms[:, 0] >= min_mz)]
+         ms = ms[(ms[:, 0] <= max_mz)]
+
+         mslst.append(ms)
+
+     size_list = [ms.shape[0] for ms in mslst]
+     maxlen = max(size_list)
+
+     l = []
+     for ms in mslst:
+         extn = maxlen-len(ms)
+         if extn > 0:
+             l.append(np.concatenate([ms, [[0, 0]]*extn], axis=0))
+         else:
+             l.append(ms)
+
+     return torch.FloatTensor(np.stack(l)), torch.IntTensor(size_list)
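As a dimension sanity check on ms_binner with the config.py defaults: ceil((2000 - 20) / 0.05) = 39600 fragment bins, and add_nl prepends ceil(200 / 0.05) = 4000 neutral-loss bins, i.e. 43600 in total, which is exactly what ConfigDict.calc_ms_embedding_dim computes. A toy spectrum (made-up peaks):

import utils

ms = [(81.07, 30.0), (109.10, 100.0), (137.13, 55.0)]
vec = utils.ms_binner(ms, min_mz=20, max_mz=2000, bin_size=0.05, add_nl=True)
print(vec.shape)          # torch.Size([43600])
print((vec != 0).sum())   # a handful of bins: 3 peaks plus their pairwise neutral losses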