yinuozhang committed
Commit 813c6b1 · 1 Parent(s): 3e730f5

add functions

README.md CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b72853bda66e29cdc787331b4373ad6575f86092f05e4caa775fd50f7cbcda2e
- size 206
+ oid sha256:8b4a57e9caf84b0991a9a349cb28b44049995f4a51ccc3118a0114baf856f36a
+ size 839
functions/binding.py ADDED
@@ -0,0 +1,186 @@
+ import torch
+ import torch.nn as nn
+ import esm
+ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+ from transformers import AutoModelForMaskedLM
+
+ base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
+
+ class ImprovedBindingPredictor(nn.Module):
+     def __init__(self,
+                  esm_dim=1280,
+                  smiles_dim=768,
+                  hidden_dim=512,
+                  n_heads=8,
+                  n_layers=3,
+                  dropout=0.1):
+         super().__init__()
+
+         # Define binding thresholds
+         self.tight_threshold = 7.5  # Kd/Ki/IC50 ≤ ~30nM
+         self.weak_threshold = 6.0   # Kd/Ki/IC50 > 1μM
+
+         # Project both modalities to the same dimension
+         self.smiles_projection = nn.Linear(smiles_dim, hidden_dim)
+         self.protein_projection = nn.Linear(esm_dim, hidden_dim)
+         self.protein_norm = nn.LayerNorm(hidden_dim)
+         self.smiles_norm = nn.LayerNorm(hidden_dim)
+
+         # Cross-attention blocks with layer norm
+         self.cross_attention_layers = nn.ModuleList([
+             nn.ModuleDict({
+                 'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
+                 'norm1': nn.LayerNorm(hidden_dim),
+                 'ffn': nn.Sequential(
+                     nn.Linear(hidden_dim, hidden_dim * 4),
+                     nn.ReLU(),
+                     nn.Dropout(dropout),
+                     nn.Linear(hidden_dim * 4, hidden_dim)
+                 ),
+                 'norm2': nn.LayerNorm(hidden_dim)
+             }) for _ in range(n_layers)
+         ])
+
+         # Shared prediction trunk
+         self.shared_head = nn.Sequential(
+             nn.Linear(hidden_dim * 2, hidden_dim),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+         )
+
+         # Regression head
+         self.regression_head = nn.Linear(hidden_dim, 1)
+
+         # Classification head (3 classes: tight, medium, weak binding)
+         self.classification_head = nn.Linear(hidden_dim, 3)
+
+     def get_binding_class(self, affinity):
+         """Convert affinity values to class indices:
+         0: tight binding (>= 7.5)
+         1: medium binding (6.0-7.5)
+         2: weak binding (< 6.0)
+         """
+         if isinstance(affinity, torch.Tensor):
+             tight_mask = affinity >= self.tight_threshold
+             weak_mask = affinity < self.weak_threshold
+             medium_mask = ~(tight_mask | weak_mask)
+
+             classes = torch.zeros_like(affinity, dtype=torch.long)
+             classes[medium_mask] = 1
+             classes[weak_mask] = 2
+             return classes
+         else:
+             if affinity >= self.tight_threshold:
+                 return 0  # tight binding
+             elif affinity < self.weak_threshold:
+                 return 2  # weak binding
+             else:
+                 return 1  # medium binding
+
+     def forward(self, protein_emb, smiles_emb):
+         protein = self.protein_norm(self.protein_projection(protein_emb))
+         smiles = self.smiles_norm(self.smiles_projection(smiles_emb))
+
+         # protein = protein.transpose(0, 1)
+         # smiles = smiles.transpose(0, 1)
+
+         # Cross-attention layers
+         for layer in self.cross_attention_layers:
+             # Protein attending to SMILES
+             attended_protein = layer['attention'](
+                 protein, smiles, smiles
+             )[0]
+             protein = layer['norm1'](protein + attended_protein)
+             protein = layer['norm2'](protein + layer['ffn'](protein))
+
+             # SMILES attending to protein
+             attended_smiles = layer['attention'](
+                 smiles, protein, protein
+             )[0]
+             smiles = layer['norm1'](smiles + attended_smiles)
+             smiles = layer['norm2'](smiles + layer['ffn'](smiles))
+
+         # Sequence-level representations
+         protein_pool = torch.mean(protein, dim=0)
+         smiles_pool = torch.mean(smiles, dim=0)
+
+         # Concatenate both representations
+         combined = torch.cat([protein_pool, smiles_pool], dim=-1)
+
+         # Shared features
+         shared_features = self.shared_head(combined)
+
+         regression_output = self.regression_head(shared_features)
+         classification_logits = self.classification_head(shared_features)
+
+         return regression_output, classification_logits
+
+ class BindingAffinity:
+     def __init__(self, prot_seq, model_type='PeptideCLM'):
+         # peptide embeddings
+         self.pep_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
+         self.pep_tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
+                                                   f'{base_path}/functions/tokenizer/new_splits.txt')
+         self.model = ImprovedBindingPredictor()
+         checkpoint = torch.load(f'{base_path}/src/binding/best_model.pt', weights_only=False)
+         self.model.load_state_dict(checkpoint['model_state_dict'])
+
+         self.model.eval()
+
+         self.esm_model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()  # load ESM-2 model
+         self.prot_tokenizer = alphabet.get_batch_converter()  # load the ESM tokenizer
+
+         data = [("target", prot_seq)]
+         # tokenize the target protein once and cache its mean-pooled embedding
+         _, _, prot_tokens = self.prot_tokenizer(data)
+         with torch.no_grad():
+             results = self.esm_model.forward(prot_tokens, repr_layers=[33])
+             prot_emb = results["representations"][33]
+
+         self.prot_emb = prot_emb[0]
+         self.prot_emb = torch.mean(self.prot_emb, dim=0, keepdim=True)
+
+     def forward(self, input_seqs):
+         with torch.no_grad():
+             scores = []
+             for seq in input_seqs:
+                 pep_tokens = self.pep_tokenizer(seq, return_tensors='pt', padding=True)
+
+                 emb = self.pep_model(input_ids=pep_tokens['input_ids'],
+                                      attention_mask=pep_tokens['attention_mask'],
+                                      output_hidden_states=True)
+
+                 pep_emb = emb.last_hidden_state.squeeze(0)
+                 pep_emb = torch.mean(pep_emb, dim=0, keepdim=True)
+
+                 score, logits = self.model.forward(self.prot_emb, pep_emb)
+                 scores.append(score.item())
+         return scores
+
+     def __call__(self, input_seqs: list):
+         return self.forward(input_seqs)
+
+ def unittest():
+     amhr = 'MLGSLGLWALLPTAVEAPPNRRTCVFFEAPGVRGSTKTLGELLDTGTELPRAIRCLYSRCCFGIWNLTQDRAQVEMQGCRDSDEPGCESLHCDPSPRAHPSPGSTLFTCSCGTDFCNANYSHLPPPGSPGTPGSQGPQAAPGESIWMALVLLGLFLLLLLLLGSIILALLQRKNYRVRGEPVPEPRPDSGRDWSVELQELPELCFSQVIREGGHAVVWAGQLQGKLVAIKAFPPRSVAQFQAERALYELPGLQHDHIVRFITASRGGPGRLLSGPLLVLELHPKGSLCHYLTQYTSDWGSSLRMALSLAQGLAFLHEERWQNGQYKPGIAHRDLSSQNVLIREDGSCAIGDLGLALVLPGLTQPPAWTPTQPQGPAAIMEAGTQRYMAPELLDKTLDLQDWGMALRRADIYSLALLLWEILSRCPDLRPDSSPPPFQLAYEAELGNTPTSDELWALAVQERRRPYIPSTWRCFATDPDGLRELLEDCWDADPEARLTAECVQQRLAALAHPQESHPFPESCPRGCPPLCPEDCTSIPAPTILPCRPQRSACHFSVQQGPCSRNPQPACTLSPV'
+     tfr = 'MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEENADNNTKANVTKPKRCSGSICYGTIAVIVFFLIGFMIGYLGYCKGVEPKTECERLAGTESPVREEPGEDFPAARRLYWDDLKRKLSEKLDSTDFTGTIKLLNENSYVPREAGSQKDENLALYVENQFREFKLSKVWRDQHFVKIQVKDSAQNSVIIVDKNGRLVYLVENPGGYVAYSKAATVTGKLVHANFGTKKDFEDLYTPVNGSIVIVRAGKITFAEKVANAESLNAIGVLIYMDQTKFPIVNAELSFFGHAHLGTGDPYTPGFPSFNHTQFPPSRSSGLPNIPVQTISRAAAEKLFGNMEGDCPSDWKTDSTCRMVTSESKNVKLTVSNVLKEIKILNIFGVIKGFVEPDHYVVVGAQRDAWGPGAAKSGVGTALLLKLAQMFSDMVLKDGFQPSRSIIFASWSAGDFGSVGATEWLEGYLSSLHLKAFTYINLDKAVLGTSNFKVSASPLLYTLIEKTMQNVKHPVTGQFLYQDSNWASKVEKLTLDNAAFPFLAYSGIPAVSFCFCEDTDYPYLGTTMDTYKELIERIPELNKVARAAAEVAGQFVIKLTHDVELNLDYERYNSQLLSFVRDLNQYRADIKEMGLSLQWLYSARGDFFRATSRLTTDFGNAEKTDRFVMKKLNDRVMRVEYHFLSPYVSPKESPFRHVFWGSGSHTLPALLENLKLRKQNNGAFNETLFRNQLALATWTIQGAANALSGDVWDIDNEF'
+     gfap = 'MERRRITSAARRSYVSSGEMMVGGLAPGRRLGPGTRLSLARMPPPLPTRVDFSLAGALNAGFKETRASERAEMMELNDRFASYIEKVRFLEQQNKALAAELNQLRAKEPTKLADVYQAELRELRLRLDQLTANSARLEVERDNLAQDLATVRQKLQDETNLRLEAENNLAAYRQEADEATLARLDLERKIESLEEEIRFLRKIHEEEVRELQEQLARQQVHVELDVAKPDLTAALKEIRTQYEAMASSNMHEAEEWYRSKFADLTDAAARNAELLRQAKHEANDYRRQLQSLTCDLESLRGTNESLERQMREQEERHVREAASYQEALARLEEEGQSLKDEMARHLQEYQDLLNVKLALDIEIATYRKLLEGEENRITIPVQTFSNLQIRETSLDTKSVSEGHLKRNIVVKTVEMRDGEVIKESKQEHKDVM'
+     glp1 = 'MAGAPGPLRLALLLLGMVGRAGPRPQGATVSLWETVQKWREYRRQCQRSLTEDPPPATDLFCNRTFDEYACWPDGEPGSFVNVSCPWYLPWASSVPQGHVYRFCTAEGLWLQKDNSSLPWRDLSECEESKRGERSSPEEQLLFLYIIYTVGYALSFSALVIASAILLGFRHLHCTRNYIHLNLFASFILRALSVFIKDAALKWMYSTAAQQHQWDGLLSYQDSLSCRLVFLLMQYCVAANYYWLLVEGVYLYTLLAFSVLSEQWIFRLYVSIGWGVPLLFVVPWGIVKYLYEDEGCWTRNSNMNYWLIIRLPILFAIGVNFLIFVRVICIVVSKLKANLMCKTDIKCRLAKSTLTLIPLLGTHEVIFAFVMDEHARGTLRFIKLFTELSFTSFQGLMVAILYCFVNNEVQLEFRKSWERWRLEHLHIQRDSSMKPLKCPTSSLSSGATAGSSMYTATCQASCS'
+     glast = 'MTKSNGEEPKMGGRMERFQQGVRKRTLLAKKKVQNITKEDVKSYLFRNAFVLLTVTAVIVGTILGFTLRPYRMSYREVKYFSFPGELLMRMLQMLVLPLIISSLVTGMAALDSKASGKMGMRAVVYYMTTTIIAVVIGIIIVIIIHPGKGTKENMHREGKIVRVTAADAFLDLIRNMFPPNLVEACFKQFKTNYEKRSFKVPIQANETLVGAVINNVSEAMETLTRITEELVPVPGSVNGVNALGLVVFSMCFGFVIGNMKEQGQALREFFDSLNEAIMRLVAVIMWYAPVGILFLIAGKIVEMEDMGVIGGQLAMYTVTVIVGLLIHAVIVLPLLYFLVTRKNPWVFIGGLLQALITALGTSSSSATLPITFKCLEENNGVDKRVTRFVLPVGATINMDGTALYEALAAIFIAQVNNFELNFGQIITISITATAASIGAAGIPQAGLVTMVIVLTSVGLPTDDITLIIAVDWFLDRLRTTTNVLGDSLGAGIVEHLSRHELKNRDVEMGNSVIEENEMKKPYQLIAQDNETEKPIDSETKM'
+     ncam = 'LQTKDLIWTLFFLGTAVSLQVDIVPSQGEISVGESKFFLCQVAGDAKDKDISWFSPNGEKLTPNQQRISVVWNDDSSSTLTIYNANIDDAGIYKCVVTGEDGSESEATVNVKIFQKLMFKNAPTPQEFREGEDAVIVCDVVSSLPPTIIWKHKGRDVILKKDVRFIVLSNNYLQIRGIKKTDEGTYRCEGRILARGEINFKDIQVIVNVPPTIQARQNIVNATANLGQSVTLVCDAEGFPEPTMSWTKDGEQIEQEEDDEKYIFSDDSSQLTIKKVDKNDEAEYICIAENKAGEQDATIHLKVFAKPKITYVENQTAMELEEQVTLTCEASGDPIPSITWRTSTRNISSEEKASWTRPEKQETLDGHMVVRSHARVSSLTLKSIQYTDAGEYICTASNTIGQDSQSMYLEVQYAPKLQGPVAVYTWEGNQVNITCEVFAYPSATISWFRDGQLLPSSNYSNIKIYNTPSASYLEVTPDSENDFGNYNCTAVNRIGQESLEFILVQADTPSSPSIDQVEPYSSTAQVQFDEPEATGGVPILKYKAEWRAVGEEVWHSKWYDAKEASMEGIVTIVGLKPETTYAVRLAALNGKGLGEISAASEF'
+
+     binding = BindingAffinity(tfr)
+
+     seq = ["CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@@H](N)Cc1c[nH]cn1)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)O"]
+
+     scores = binding(seq)
+     print(scores)
+
+ if __name__ == '__main__':
+     unittest()
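Note: the thresholds above imply the regression head predicts affinity on a -log10(M) scale (pKd/pKi/pIC50). A quick check of that arithmetic, assuming pKd-style labels as the inline comments suggest:

    tight_kd_nm = 10 ** (9 - 7.5)  # ≈ 31.6 nM: pKd >= 7.5 is the "tight" boundary (matches "≤ ~30nM")
    weak_kd_nm = 10 ** (9 - 6.0)   # 1000 nM = 1 µM: pKd < 6.0 is the "weak" boundary (matches "> 1μM")
    print(tight_kd_nm, weak_kd_nm)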
functions/hemolysis.py ADDED
@@ -0,0 +1,69 @@
+ import sys
+ import os
+ import xgboost as xgb
+ import torch
+ import numpy as np
+ from transformers import AutoModelForMaskedLM
+ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+ import warnings
+ from rdkit.Chem import Descriptors, rdMolDescriptors
+ from rdkit import Chem, rdBase, DataStructs
+ from rdkit.Chem import AllChem
+ from typing import List
+
+ rdBase.DisableLog('rdApp.error')
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
+ warnings.filterwarnings("ignore", category=UserWarning)
+ warnings.filterwarnings("ignore", category=FutureWarning)
+
+ base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
+
+ class Hemolysis:
+
+     def __init__(self):
+         self.predictor = xgb.Booster(model_file=f'{base_path}/src/best_model_f1.json')
+         self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
+         self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
+                                               f'{base_path}/functions/tokenizer/new_splits.txt')
+
+     def generate_embeddings(self, sequences):
+         embeddings = []
+         for sequence in sequences:
+             tokenized = self.tokenizer(sequence, return_tensors='pt')
+             with torch.no_grad():
+                 output = self.emb_model(**tokenized)
+             # Mean pooling across sequence length
+             embedding = output.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
+             embeddings.append(embedding)
+         return np.array(embeddings)
+
+     def get_scores(self, input_seqs: list):
+         scores = np.ones(len(input_seqs))
+         features = self.generate_embeddings(input_seqs)
+
+         if len(features) == 0:
+             return scores
+
+         features = np.nan_to_num(features, nan=0.)
+         features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
+
+         features = xgb.DMatrix(features)
+
+         probs = self.predictor.predict(features)
+         # return the probability of being non-hemolytic (1 - P(hemolytic))
+         return scores - probs
+
+     def __call__(self, input_seqs: list):
+         scores = self.get_scores(input_seqs)
+         return scores
+
+ def unittest():
+     hemo = Hemolysis()
+     seq = ["NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
+
+     scores = hemo(input_seqs=seq)
+     print(scores)
+
+
+ if __name__ == '__main__':
+     unittest()
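Note: the booster here outputs P(hemolytic), so get_scores returns 1 - P(hemolytic); higher means more likely safe. A minimal sketch of thresholding that output (the 0.5 cutoff is illustrative, not from this repo):

    hemo = Hemolysis()
    likely_safe = hemo(input_seqs=smiles_list) > 0.5  # smiles_list: any list of peptide SMILES strings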
functions/nonfouling.py ADDED
@@ -0,0 +1,69 @@
+ import sys
+ import os
+ import xgboost as xgb
+ import torch
+ import numpy as np
+ from transformers import AutoModelForMaskedLM
+ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+ import warnings
+ from rdkit import Chem, rdBase, DataStructs
+
+
+ rdBase.DisableLog('rdApp.error')
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
+ warnings.filterwarnings("ignore", category=UserWarning)
+ warnings.filterwarnings("ignore", category=FutureWarning)
+
+ base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
+
+ class Nonfouling:
+
+     def __init__(self):
+         self.predictor = xgb.Booster(model_file=f'{base_path}/src/nonfouling/best_model_f1.json')
+         self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
+         self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
+                                               f'{base_path}/functions/tokenizer/new_splits.txt')
+
+     def generate_embeddings(self, sequences):
+         embeddings = []
+         for sequence in sequences:
+             tokenized = self.tokenizer(sequence, return_tensors='pt')
+             with torch.no_grad():
+                 output = self.emb_model(**tokenized)
+             # Mean pooling across sequence length
+             embedding = output.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
+             embeddings.append(embedding)
+         return np.array(embeddings)
+
+     def get_scores(self, input_seqs: list):
+         scores = np.zeros(len(input_seqs))
+         features = self.generate_embeddings(input_seqs)
+
+         if len(features) == 0:
+             return scores
+
+         features = np.nan_to_num(features, nan=0.)
+         features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
+
+         features = xgb.DMatrix(features)
+
+         scores = self.predictor.predict(features)
+         # return the probability of being nonfouling
+         return scores
+
+     def __call__(self, input_seqs: list):
+         scores = self.get_scores(input_seqs)
+         return scores
+
+ def unittest():
+     nf = Nonfouling()
+     seq = ["NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
+
+     scores = nf(input_seqs=seq)
+     print(scores)
+
+
+ if __name__ == '__main__':
+     unittest()
functions/permeability.py ADDED
@@ -0,0 +1,167 @@
+ import sys
+ import os
+ import xgboost as xgb
+ import torch
+ import numpy as np
+ from transformers import AutoModelForMaskedLM
+ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+ import warnings
+ from rdkit.Chem import Descriptors, rdMolDescriptors
+ from rdkit import Chem, rdBase, DataStructs
+ from rdkit.Chem import AllChem
+ from typing import List
+
+ base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
+
+ rdBase.DisableLog('rdApp.error')
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
+ warnings.filterwarnings("ignore", category=UserWarning)
+ warnings.filterwarnings("ignore", category=FutureWarning)
+
+ def fingerprints_from_smiles(smiles: List, size=2048):
+     """Create ECFP fingerprints from SMILES, with a validity check."""
+     fps = []
+     valid_mask = []
+     for i, smile in enumerate(smiles):
+         mol = Chem.MolFromSmiles(smile)
+         valid_mask.append(int(mol is not None))
+         fp = fingerprints_from_mol(mol, size=size) if mol else np.zeros((1, size))
+         fps.append(fp)
+
+     fps = np.concatenate(fps, axis=0)
+     return fps, valid_mask
+
+
+ def fingerprints_from_mol(molecule, radius=3, size=2048, hashed=False):
+     """Create the ECFP fingerprint of a molecule."""
+     if hashed:
+         fp_bits = AllChem.GetHashedMorganFingerprint(molecule, radius, nBits=size)
+     else:
+         fp_bits = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=size)
+     fp_np = np.zeros((1,))
+     DataStructs.ConvertToNumpyArray(fp_bits, fp_np)
+     return fp_np.reshape(1, -1)
+
+ def getMolDescriptors(mol, missingVal=0):
+     """Calculate the full list of RDKit descriptors for a molecule."""
+
+     values, names = [], []
+     for nm, fn in Descriptors._descList:
+         try:
+             val = fn(mol)
+         except Exception:
+             val = missingVal
+         values.append(val)
+         names.append(nm)
+
+     custom_descriptors = {'hydrogen-bond donors': rdMolDescriptors.CalcNumLipinskiHBD,
+                           'hydrogen-bond acceptors': rdMolDescriptors.CalcNumLipinskiHBA,
+                           'rotatable bonds': rdMolDescriptors.CalcNumRotatableBonds,}
+
+     for nm, fn in custom_descriptors.items():
+         try:
+             val = fn(mol)
+         except Exception:
+             val = missingVal
+         values.append(val)
+         names.append(nm)
+     return values, names
+
+ def get_pep_dps_from_smi(smi):
+     try:
+         mol = Chem.MolFromSmiles(smi)
+     except Exception:
+         print(f"converting smi {smi} to a molecule failed!")
+         mol = None
+
+     dps, _ = getMolDescriptors(mol)
+     return np.array(dps)
+
+
+ def get_pep_dps(smi_list):
+     if len(smi_list) == 0:
+         return np.zeros((0, 213))
+     return np.array([get_pep_dps_from_smi(smi) for smi in smi_list])
+
+ def check_smi_validity(smiles: list):
+     valid_smi, valid_idx = [], []
+     for idx, smi in enumerate(smiles):
+         try:
+             mol = Chem.MolFromSmiles(smi) if smi else None
+             if mol:
+                 valid_smi.append(smi)
+                 valid_idx.append(idx)
+         except Exception as e:
+             # logger.debug(f'Error: {e} in smiles {smi}')
+             pass
+     return valid_smi, valid_idx
+
+ class Permeability:
+
+     def __init__(self):
+         self.predictor = xgb.Booster(model_file=f'{base_path}/src/permeability/best_model.json')
+         self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
+         self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
+                                               f'{base_path}/functions/tokenizer/new_splits.txt')
+
+     def generate_embeddings(self, sequences):
+         embeddings = []
+         for sequence in sequences:
+             tokenized = self.tokenizer(sequence, return_tensors='pt')
+             with torch.no_grad():
+                 output = self.emb_model(**tokenized)
+             # Mean pooling across sequence length
+             embedding = output.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
+             embeddings.append(embedding)
+         return np.array(embeddings)
+
+     def get_features(self, input_seqs: list, dps=False, fps=False):
+         # valid_smiles, valid_idxes = check_smi_validity(input_seqs)
+
+         if fps:
+             fingerprints = fingerprints_from_smiles(input_seqs)[0]
+         else:
+             fingerprints = np.empty((len(input_seqs), 0))
+
+         if dps:
+             descriptors = get_pep_dps(input_seqs)
+         else:
+             descriptors = np.empty((len(input_seqs), 0))
+
+         embeddings = self.generate_embeddings(input_seqs)
+         # logger.debug(f'X_fps.shape: {X_fps.shape}, X_dps.shape: {X_dps.shape}')
+
+         features = np.concatenate([fingerprints, descriptors, embeddings], axis=1)
+
+         return features
+
+     def get_scores(self, input_seqs: list):
+         scores = -10 * np.ones(len(input_seqs))
+         features = self.get_features(input_seqs)
+
+         if len(features) == 0:
+             return scores
+
+         features = np.nan_to_num(features, nan=0.)
+         features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
+
+         features = xgb.DMatrix(features)
+
+         scores = self.predictor.predict(features)
+         return scores
+
+     def __call__(self, input_seqs: list):
+         scores = self.get_scores(input_seqs)
+         return scores
+
+ def unittest():
+     permeability = Permeability()
+     seq = ['N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1cNc2c1cc(O)cc2)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H]([C@@H](O)C(C)C)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CC(=CN2)C1=C2C=CC=C1)C(=O)O']
+     scores = permeability(input_seqs=seq)
+     print(scores)
+
+
+ if __name__ == '__main__':
+     unittest()
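Note: get_scores calls get_features with the defaults dps=False, fps=False, so the shipped permeability model is scored on PeptideCLM embeddings alone; the ECFP and descriptor branches are opt-in. If they are enabled, the feature layout must match what best_model.json was trained on. A sketch, assuming such a model exists:

    perm = Permeability()
    # smiles_list is any list of SMILES strings (hypothetical input)
    X = perm.get_features(smiles_list, fps=True, dps=True)  # 2048-bit ECFP + RDKit descriptors + embeddings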
functions/solubility.py ADDED
@@ -0,0 +1,68 @@
+ import sys
+ import os
+ import xgboost as xgb
+ import torch
+ import numpy as np
+ from transformers import AutoModelForMaskedLM
+ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+ import warnings
+ from rdkit.Chem import Descriptors, rdMolDescriptors
+ from rdkit import Chem, rdBase, DataStructs
+ from rdkit.Chem import AllChem
+ from typing import List
+
+
+ rdBase.DisableLog('rdApp.error')
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
+ warnings.filterwarnings("ignore", category=UserWarning)
+ warnings.filterwarnings("ignore", category=FutureWarning)
+
+ base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
+
+ class Solubility:
+     def __init__(self):
+         self.predictor = xgb.Booster(model_file=f'{base_path}/src/solubility/best_model_f1.json')
+         self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
+         self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
+                                               f'{base_path}/functions/tokenizer/new_splits.txt')
+
+     def generate_embeddings(self, sequences):
+         embeddings = []
+         for sequence in sequences:
+             tokenized = self.tokenizer(sequence, return_tensors='pt')
+             with torch.no_grad():
+                 output = self.emb_model(**tokenized)
+             # Mean pooling across sequence length
+             embedding = output.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
+             embeddings.append(embedding)
+         return np.array(embeddings)
+
+     def get_scores(self, input_seqs: list):
+         scores = np.zeros(len(input_seqs))
+         features = self.generate_embeddings(input_seqs)
+
+         if len(features) == 0:
+             return scores
+
+         features = np.nan_to_num(features, nan=0.)
+         features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
+
+         features = xgb.DMatrix(features)
+
+         scores = self.predictor.predict(features)
+         return scores
+
+     def __call__(self, input_seqs: list):
+         scores = self.get_scores(input_seqs)
+         return scores
+
+ def unittest():
+     solubility = Solubility()
+     seq = ["NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
+     scores = solubility(input_seqs=seq)
+     print(scores)
+
+ if __name__ == '__main__':
+     unittest()
functions/tokenizer/__pycache__/my_tokenizers.cpython-310.pyc ADDED
Binary file (15.5 kB)
functions/tokenizer/my_tokenizers.py ADDED
@@ -0,0 +1,398 @@
+ import collections
+ import logging
+ import os
+ import re
+ from typing import List, Optional
+ from transformers import PreTrainedTokenizer
+ from SmilesPE.tokenizer import SPE_Tokenizer
+
+ logger = logging.getLogger(__name__)
+
+ # Assumed default filename for save_vocabulary(); the original module referenced
+ # VOCAB_FILES_NAMES without defining it.
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+ def load_vocab(vocab_file):
+     """Loads a vocabulary file into a dictionary."""
+     vocab = collections.OrderedDict()
+     with open(vocab_file, "r", encoding="utf-8") as reader:
+         tokens = reader.readlines()
+     for index, token in enumerate(tokens):
+         token = token.rstrip("\n")
+         vocab[token] = index
+     return vocab
+
+ class Atomwise_Tokenizer(object):
+     """Run atom-level SMILES tokenization."""
+
+     def __init__(self):
+         """Constructs an atom-level tokenizer."""
+         # self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
+         self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
+         self.regex = re.compile(self.regex_pattern)
+
+     def tokenize(self, text):
+         """Basic tokenization of a SMILES string."""
+         tokens = [token for token in self.regex.findall(text)]
+         return tokens
+
+ class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
+     r"""
+     Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
+     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+     should refer to the superclass for more information regarding methods.
+     Args:
+         vocab_file (:obj:`string`):
+             File containing the vocabulary.
+         spe_file (:obj:`string`):
+             File containing the trained SMILES Pair Encoding vocabulary.
+         unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead.
+         sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+             for sequence classification or for a text and a question for question answering.
+             It is also used as the last token of a sequence built with special tokens.
+         pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
+             The token used for padding, for example when batching sequences of different lengths.
+         cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+             The classifier token which is used when doing sequence classification (classification of the whole
+             sequence instead of per-token classification). It is the first token of the sequence when built with
+             special tokens.
+         mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
+             The token used for masking values. This is the token used when training this model with masked language
+             modeling. This is the token which the model will try to predict.
+     """
+
+     def __init__(self, vocab_file, spe_file,
+                  unk_token="[UNK]",
+                  sep_token="[SEP]",
+                  pad_token="[PAD]",
+                  cls_token="[CLS]",
+                  mask_token="[MASK]",
+                  **kwargs):
+         if not os.path.isfile(vocab_file):
+             raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
+         if not os.path.isfile(spe_file):
+             raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))
+
+         self.vocab = load_vocab(vocab_file)
+         self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+         # SPE_Tokenizer consumes the merges file on construction, so close it afterwards
+         with open(spe_file, 'r', encoding='utf-8') as spe_vocab:
+             self.spe_tokenizer = SPE_Tokenizer(spe_vocab)
+
+         super().__init__(
+             unk_token=unk_token,
+             sep_token=sep_token,
+             pad_token=pad_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             **kwargs)
+
+     @property
+     def vocab_size(self):
+         return len(self.vocab)
+
+     def get_vocab(self):
+         return dict(self.vocab, **self.added_tokens_encoder)
+
+     def _tokenize(self, text):
+         return self.spe_tokenizer.tokenize(text).split(' ')
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) to an id using the vocab."""
+         return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+     def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+         text = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+         return self.convert_tokens_to_string(text)
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) to a token (str) using the vocab."""
+         return self.ids_to_tokens.get(index, self.unk_token)
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (strings) into a single string."""
+         out_string = " ".join(tokens).replace(" ##", "").strip()
+         return out_string
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks
+         by concatenating and adding special tokens.
+         A BERT sequence has the following format:
+         - single sequence: ``[CLS] X [SEP]``
+         - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs to which the special tokens will be added
+             token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                 Optional second list of IDs for sequence pairs.
+         Returns:
+             :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+         """
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         cls = [self.cls_token_id]
+         sep = [self.sep_token_id]
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     def get_special_tokens_mask(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+     ) -> List[int]:
+         """
+         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer ``prepare_for_model`` method.
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of ids.
+             token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                 Set to True if the token list is already formatted with special tokens for the model
+         Returns:
+             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+
+         if already_has_special_tokens:
+             if token_ids_1 is not None:
+                 raise ValueError(
+                     "You should not supply a second sequence if the provided sequence of "
+                     "ids is already formatted with special tokens for the model."
+                 )
+             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+         if token_ids_1 is not None:
+             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1]
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+         A BERT sequence pair mask has the following format:
+         ::
+             0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+             | first sequence     | second sequence |
+         If token_ids_1 is None, only returns the first portion of the mask (0's).
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of ids.
+             token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                 Optional second list of IDs for sequence pairs.
+         Returns:
+             :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+             sequence(s).
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+     def save_vocabulary(self, vocab_path):
+         """
+         Save the vocabulary (copy of the original file) and special tokens file to a directory.
+         Args:
+             vocab_path (:obj:`str`):
+                 The directory in which to save the vocabulary.
+         Returns:
+             :obj:`Tuple(str)`: Paths to the files saved.
+         """
+         index = 0
+         if os.path.isdir(vocab_path):
+             vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
+         else:
+             vocab_file = vocab_path
+         with open(vocab_file, "w", encoding="utf-8") as writer:
+             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                 if index != token_index:
+                     logger.warning(
+                         "Saving vocabulary to {}: vocabulary indices are not consecutive."
+                         " Please check that the vocabulary is not corrupted!".format(vocab_file)
+                     )
+                     index = token_index
+                 writer.write(token + "\n")
+                 index += 1
+         return (vocab_file,)
+
+ class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
+     r"""
+     Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
+     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+     should refer to the superclass for more information regarding methods.
+     Args:
+         vocab_file (:obj:`string`):
+             File containing the vocabulary.
+         unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead.
+         sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+             for sequence classification or for a text and a question for question answering.
+             It is also used as the last token of a sequence built with special tokens.
+         pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
+             The token used for padding, for example when batching sequences of different lengths.
+         cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+             The classifier token which is used when doing sequence classification (classification of the whole
+             sequence instead of per-token classification). It is the first token of the sequence when built with
+             special tokens.
+         mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
+             The token used for masking values. This is the token used when training this model with masked language
+             modeling. This is the token which the model will try to predict.
+     """
+
+     def __init__(
+         self,
+         vocab_file,
+         unk_token="[UNK]",
+         sep_token="[SEP]",
+         pad_token="[PAD]",
+         cls_token="[CLS]",
+         mask_token="[MASK]",
+         **kwargs
+     ):
+         if not os.path.isfile(vocab_file):
+             raise ValueError(
+                 "Can't find a vocabulary file at path '{}'.".format(vocab_file)
+             )
+         # load the vocab before calling super().__init__, which may consult it
+         self.vocab = load_vocab(vocab_file)
+         self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+         self.tokenizer = Atomwise_Tokenizer()
+
+         super().__init__(
+             unk_token=unk_token,
+             sep_token=sep_token,
+             pad_token=pad_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             **kwargs,
+         )
+
+     @property
+     def vocab_size(self):
+         return len(self.vocab)
+
+     def get_vocab(self):
+         return dict(self.vocab, **self.added_tokens_encoder)
+
+     def _tokenize(self, text):
+         return self.tokenizer.tokenize(text)
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) to an id using the vocab."""
+         return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) to a token (str) using the vocab."""
+         return self.ids_to_tokens.get(index, self.unk_token)
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (strings) into a single string."""
+         out_string = " ".join(tokens).replace(" ##", "").strip()
+         return out_string
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks
+         by concatenating and adding special tokens.
+         A BERT sequence has the following format:
+         - single sequence: ``[CLS] X [SEP]``
+         - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs to which the special tokens will be added
+             token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                 Optional second list of IDs for sequence pairs.
+         Returns:
+             :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+         """
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         cls = [self.cls_token_id]
+         sep = [self.sep_token_id]
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     def get_special_tokens_mask(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+     ) -> List[int]:
+         """
+         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer ``prepare_for_model`` method.
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of ids.
+             token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                 Set to True if the token list is already formatted with special tokens for the model
+         Returns:
+             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+
+         if already_has_special_tokens:
+             if token_ids_1 is not None:
+                 raise ValueError(
+                     "You should not supply a second sequence if the provided sequence of "
+                     "ids is already formatted with special tokens for the model."
+                 )
+             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+         if token_ids_1 is not None:
+             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1]
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+         A BERT sequence pair mask has the following format:
+         ::
+             0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+             | first sequence     | second sequence |
+         If token_ids_1 is None, only returns the first portion of the mask (0's).
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of ids.
+             token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                 Optional second list of IDs for sequence pairs.
+         Returns:
+             :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+             sequence(s).
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+     def save_vocabulary(self, vocab_path):
+         """
+         Save the vocabulary (copy of the original file) and special tokens file to a directory.
+         Args:
+             vocab_path (:obj:`str`):
+                 The directory in which to save the vocabulary.
+         Returns:
+             :obj:`Tuple(str)`: Paths to the files saved.
+         """
+         index = 0
+         if os.path.isdir(vocab_path):
+             vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
+         else:
+             vocab_file = vocab_path
+         with open(vocab_file, "w", encoding="utf-8") as writer:
+             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                 if index != token_index:
+                     logger.warning(
+                         "Saving vocabulary to {}: vocabulary indices are not consecutive."
+                         " Please check that the vocabulary is not corrupted!".format(vocab_file)
+                     )
+                     index = token_index
+                 writer.write(token + "\n")
+                 index += 1
+         return (vocab_file,)
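A minimal usage sketch of the SPE tokenizer (file paths as wired up in the scoring classes above; the SMILES is illustrative):

    tok = SMILES_SPE_Tokenizer('new_vocab.txt', 'new_splits.txt')
    enc = tok('CC(=O)O', return_tensors='pt')
    # input_ids begin with [CLS] and end with [SEP], per build_inputs_with_special_tokens
    print(tok.decode(enc['input_ids'][0].tolist()))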
functions/tokenizer/new_splits.txt ADDED
@@ -0,0 +1,159 @@
+ c 1
+ c 2
+ c 3
+ c 4
+ c 5
+ c 6
+ c 7
+ c 8
+ c 9
+ ( c1
+ ( c2
+ c1 )
+ c2 )
+ n 1
+ n 2
+ n 3
+ n 4
+ n 5
+ n 6
+ n 7
+ n 8
+ n 9
+ ( n1
+ ( n2
+ n1 )
+ n2 )
+ O 1
+ O 2
+ O 3
+ O 4
+ O 5
+ O 6
+ O 7
+ O 8
+ O 9
+ ( O1
+ ( O2
+ O2 )
+ O2 )
+ = O
+ = C
+ = c
+ = N
+ = n
+ =C C
+ =C N
+ =C c
+ =c c
+ =N C
+ =N c
+ =n C
+ =n c
+ # N
+ # C
+ #N C
+ #C C
+ #C N
+ #N N
+ ( C
+ C )
+ ( O
+ O )
+ ( N
+ N )
+ Br c
+ ( =O
+ (=O )
+ C (=O)
+ C =O
+ C =N
+ C #N
+ C #C
+ C C
+ CC C
+ CC N
+ CC O
+ CC S
+ CC c
+ CC n
+ C N
+ CN C
+ CN c
+ C O
+ CO C
+ CO N
+ CO c
+ C S
+ CS C
+ CS S
+ CS c
+ C c
+ Cl c
+ C n
+ F c
+ N C
+ NC C
+ NC c
+ N N
+ N O
+ N c
+ N n
+ O C
+ OC C
+ OC O
+ OC c
+ O N
+ O O
+ O c
+ S C
+ SC C
+ SC c
+ S S
+ S c
+ c c
+ cc c
+ cc n
+ cc o
+ cc s
+ cc cc
+ c n
+ cn c
+ cn n
+ c o
+ co c
+ c s
+ cs c
+ cs n
+ n c
+ nc c
+ nc n
+ nc o
+ nc s
+ n n
+ nn c
+ nn n
+ n o
+ no c
+ no n
+ n s
+ ns c
+ ns n
+ o c
+ oc c
+ o n
+ s c
+ sc c
+ sc n
+ s n
+ N P
+ P N
+ C P
+ P C
+ N S
+ S N
+ C S
+ S C
+ S P
+ P S
+ C I
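Note: each line of new_splits.txt appears to be a SMILES Pair Encoding merge rule in BPE style — the two space-separated tokens on a line are merged into one vocabulary token, with rules applied in file order. An illustrative (hypothetical) trace using the "C C" and "CC C" rules above:

    # atom tokens of propane 'CCC': ['C', 'C', 'C'] -> ['CC', 'C'] -> ['CCC']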
functions/tokenizer/new_vocab.txt ADDED
@@ -0,0 +1,586 @@
+ [PAD]
+ [UNK]
+ [CLS]
+ [SEP]
+ [MASK]
+ #
+ %
+ (
+ )
+ +
+ -
+ /
+ 0
+ 1
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+ 8
+ 9
+ =
+ @
+ A
+ B
+ Br
+ Brc
+ C
+ CC
+ CCC
+ CCN
+ CCO
+ CCS
+ CCc
+ CCn
+ CN
+ CNC
+ CNc
+ CO
+ COC
+ CON
+ COc
+ CS
+ CSC
+ CSS
+ CSc
+ Cc
+ Cl
+ Clc
+ Cn
+ F
+ Fc
+ H
+ I
+ K
+ L
+ M
+ N
+ NC
+ NCC
+ NCc
+ NN
+ NO
+ Nc
+ Nn
+ O
+ OC
+ OCC
+ OCO
+ OCc
+ ON
+ OO
+ Oc
+ P
+ R
+ S
+ SC
+ SCC
+ SCc
+ SS
+ Sc
+ T
+ X
+ Z
+ [
+ \\
+ (/
+ ]
+ a
+ b
+ c
+ cc
+ ccc
+ ccn
+ cco
+ ccs
+ cn
+ cnc
+ cnn
+ co
+ coc
+ cs
+ csc
+ csn
+ e
+ g
+ i
+ l
+ n
+ nc
+ ncc
+ ncn
+ nco
+ ncs
+ nn
+ nnc
+ nnn
+ no
+ noc
+ non
+ ns
+ nsc
+ nsn
+ o
+ oc
+ occ
+ on
+ p
+ r
+ s
+ sc
+ scc
+ scn
+ sn
+ t
+ c1
+ c2
+ c3
+ c4
+ c5
+ c6
+ c7
+ c8
+ c9
+ n1
+ n2
+ n3
+ n4
+ n5
+ n6
+ n7
+ n8
+ n9
+ O1
+ O2
+ O3
+ O4
+ O5
+ O6
+ O7
+ O8
+ O9
+ (c1
+ (c2
+ c1)
+ c2)
+ (n1
+ (n2
+ n1)
+ n2)
+ (O1
+ (O2
+ O2)
+ =O
+ =C
+ =c
+ =N
+ =n
+ =CC
+ =CN
+ =Cc
+ =cc
+ =NC
+ =Nc
+ =nC
+ =nc
+ #C
+ #CC
+ #CN
+ #N
+ #NC
+ #NN
+ (C
+ C)
+ (O
+ O)
+ (N
+ N)
+ NP
+ PN
+ CP
+ PC
+ NS
+ SN
+ SP
+ PS
+ C(=O)
+ (/Br)
+ (/C#N)
+ (/C)
+ (/C=N)
+ (/C=O)
+ (/CBr)
+ (/CC)
+ (/CCC)
+ (/CCF)
+ (/CCN)
+ (/CCO)
+ (/CCl)
+ (/CI)
+ (/CN)
+ (/CO)
+ (/CS)
+ (/Cl)
+ (/F)
+ (/I)
+ (/N)
+ (/NC)
+ (/NCC)
+ (/NO)
+ (/O)
+ (/OC)
+ (/OCC)
+ (/S)
+ (/SC)
+ (=C)
+ (=C/C)
+ (=C/F)
+ (=C/I)
+ (=C/N)
+ (=C/O)
+ (=CBr)
+ (=CC)
+ (=CCF)
+ (=CCN)
+ (=CCO)
+ (=CCl)
+ (=CF)
+ (=CI)
+ (=CN)
+ (=CO)
+ (=C\\C)
+ (=C\\F)
+ (=C\\I)
+ (=C\\N)
+ (=C\\O)
+ (=N)
+ (=N/C)
+ (=N/N)
+ (=N/O)
+ (=NBr)
+ (=NC)
+ (=NCC)
+ (=NCl)
+ (=NN)
+ (=NO)
+ (=NOC)
+ (=N\\C)
+ (=N\\N)
+ (=N\\O)
+ (=O)
+ (=S)
+ (B)
+ (Br)
+ (C#C)
+ (C#CC)
+ (C#CI)
+ (C#CO)
+ (C#N)
+ (C#SN)
+ (C)
+ (C=C)
+ (C=CF)
+ (C=CI)
+ (C=N)
+ (C=NN)
+ (C=NO)
+ (C=O)
+ (C=S)
+ (CBr)
+ (CC#C)
+ (CC#N)
+ (CC)
+ (CC=C)
+ (CC=O)
+ (CCBr)
+ (CCC)
+ (CCCC)
+ (CCCF)
+ (CCCI)
+ (CCCN)
+ (CCCO)
+ (CCCS)
+ (CCCl)
+ (CCF)
+ (CCI)
+ (CCN)
+ (CCNC)
+ (CCNN)
+ (CCNO)
+ (CCO)
+ (CCOC)
+ (CCON)
+ (CCS)
+ (CCSC)
+ (CCl)
+ (CF)
+ (CI)
+ (CN)
+ (CN=O)
+ (CNC)
+ (CNCC)
+ (CNCO)
+ (CNN)
+ (CNNC)
+ (CNO)
+ (CNOC)
+ (CO)
+ (COC)
+ (COCC)
+ (COCI)
+ (COCN)
+ (COCO)
+ (COF)
+ (CON)
+ (COO)
+ (CS)
+ (CSC)
+ (CSCC)
+ (CSCF)
+ (CSO)
+ (Cl)
+ (F)
+ (I)
+ (N)
+ (N=N)
+ (N=NO)
+ (N=O)
+ (N=S)
+ (NBr)
+ (NC#N)
+ (NC)
+ (NC=N)
+ (NC=O)
+ (NC=S)
+ (NCBr)
+ (NCC)
+ (NCCC)
+ (NCCF)
+ (NCCN)
+ (NCCO)
+ (NCCS)
+ (NCCl)
+ (NCNC)
+ (NCO)
+ (NCS)
+ (NCl)
+ (NN)
+ (NN=O)
+ (NNC)
+ (NO)
+ (NOC)
+ (O)
+ (OC#N)
+ (OC)
+ (OC=C)
+ (OC=O)
+ (OC=S)
+ (OCBr)
+ (OCC)
+ (OCCC)
+ (OCCF)
+ (OCCI)
+ (OCCN)
+ (OCCO)
+ (OCCS)
+ (OCCl)
+ (OCF)
+ (OCI)
+ (OCO)
+ (OCOC)
+ (OCON)
+ (OCSC)
+ (OCl)
+ (OI)
+ (ON)
+ (OO)
+ (OOC)
+ (OOCC)
+ (OOSN)
+ (OSC)
+ (P)
+ (S)
+ (SC#N)
+ (SC)
+ (SCC)
+ (SCCC)
+ (SCCF)
+ (SCCN)
+ (SCCO)
+ (SCCS)
+ (SCCl)
+ (SCF)
+ (SCN)
+ (SCOC)
+ (SCSC)
+ (SCl)
+ (SI)
+ (SN)
+ (SN=O)
+ (SO)
+ (SOC)
+ (SOOO)
+ (SS)
+ (SSC)
+ (SSCC)
+ ([At])
+ ([O-])
+ ([O])
+ ([S-])
+ (\\Br)
+ (\\C#N)
+ (\\C)
+ (\\C=N)
+ (\\C=O)
+ (\\CBr)
+ (\\CC)
+ (\\CCC)
+ (\\CCO)
+ (\\CCl)
+ (\\CF)
+ (\\CN)
+ (\\CNC)
+ (\\CO)
+ (\\COC)
+ (\\Cl)
+ (\\F)
+ (\\I)
+ (\\N)
+ (\\NC)
+ (\\NCC)
+ (\\NN)
+ (\\NO)
+ (\\NOC)
+ (\\O)
+ (\\OC)
+ (\\OCC)
+ (\\ON)
+ (\\S)
+ (\\SC)
+ (\\SCC)
+ [Ag+]
+ [Ag-4]
+ [Ag]
+ [Al-3]
+ [Al]
+ [As+]
+ [AsH3]
+ [AsH]
+ [As]
+ [At]
+ [B-]
+ [B@-]
+ [B@@-]
+ [BH-]
+ [BH2-]
+ [BH3-]
+ [B]
+ [Ba]
+ [Br+2]
+ [BrH]
+ [Br]
+ [C+]
+ [C-]
+ [C@@H]
+ [C@@]
+ [C@H]
+ [C@]
+ [CH-]
+ [CH2]
+ [CH3]
+ [CH]
+ [C]
+ [CaH2]
+ [Ca]
+ [Cl+2]
+ [Cl+3]
+ [Cl+]
+ [Cs]
+ [FH]
+ [F]
+ [H]
+ [He]
+ [I+2]
+ [I+3]
+ [I+]
+ [IH]
+ [I]
+ [K]
+ [Kr]
+ [Li+]
+ [LiH]
+ [MgH2]
+ [Mg]
+ [N+]
+ [N-]
+ [N@+]
+ [N@@+]
+ [N@@]
+ [N@]
+ [NH+]
+ [NH-]
+ [NH2+]
+ [NH3]
+ [NH]
+ [N]
+ [Na]
+ [O+]
+ [O-]
+ [OH+]
+ [OH2]
+ [OH]
+ [O]
+ [P+]
+ [P@+]
+ [P@@+]
+ [P@@]
+ [P@]
+ [PH2]
+ [PH]
+ [P]
+ [Ra]
+ [Rb]
+ [S+]
+ [S-]
+ [S@+]
+ [S@@+]
+ [S@@]
+ [S@]
+ [SH+]
+ [SH2]
+ [SH]
+ [S]
+ [Se+]
+ [Se-2]
+ [SeH2]
+ [SeH]
+ [Se]
+ [Si@]
+ [SiH2]
+ [SiH]
+ [Si]
+ [SrH2]
+ [TeH]
+ [Te]
+ [Xe]
+ [Zn+2]
+ [Zn-2]
+ [Zn]
+ [b-]
+ [c+]
+ [c-]
+ [cH-]
+ [cH]
+ [c]
+ [n+]
+ [n-]
+ [nH]
+ [n]
+ [o+]
+ [s+]
+ [se+]
+ [se]
+ [te+]
+ [te]
scoring_functions.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import io
3
+ import subprocess
4
+ import warnings
5
+ import numpy as np
6
+ import pandas as pd
7
+ from typing import List
8
+ from loguru import logger
9
+ from tqdm import tqdm
10
+ from rdkit import Chem, rdBase, DataStructs
11
+ from rdkit.Chem import AllChem
12
+ import torch
13
+ from functions.binding.binding import BindingAffinity
14
+ from functions.permeability.permeability import Permeability
15
+ from functions.solubility.solubility import Solubility
16
+ from functions.hemolysis.hemolysis import Hemolysis
17
+ from functions.nonfouling.nonfouling import Nonfouling
18
+
19
+ class ScoringFunctions:
20
+ def __init__(self, score_func_names=None, prot_seqs=[]):
21
+ """
22
+ Class for generating score vectors given generated sequence
23
+
24
+ Args:
25
+ score_func_names: list of scoring function names to be evaluated
26
+ score_weights: weights to scale scores (default: 1)
27
+ target_protein: sequence of target protein binder
28
+ """
29
+ if score_func_names is None:
30
+ # just do unmasking based on validity of peptide bonds
31
+ self.score_func_names = []
32
+ else:
33
+ self.score_func_names = score_func_names
34
+
35
+ # self.weights = np.array([1] * len(self.score_func_names) if score_weights is None else score_weights)
36
+
37
+ # binding affinities
38
+ self.target_protein = prot_seqs
39
+ print(len(prot_seqs))
40
+
41
+ if ('binding_affinity1' in score_func_names) and (len(prot_seqs) == 1):
42
+ binding_affinity1 = BindingAffinity(prot_seqs[0])
43
+ binding_affinity2 = None
44
+ elif ('binding_affinity1' in score_func_names) and ('binding_affinity2' in score_func_names) and (len(prot_seqs) == 2):
45
+ binding_affinity1 = BindingAffinity(prot_seqs[0])
46
+ binding_affinity2 = BindingAffinity(prot_seqs[1])
47
+ else:
48
+ print("here")
49
+ binding_affinity1 = None
50
+ binding_affinity2 = None
51
+
52
+ permeability = Permeability()
53
+ sol = Solubility()
54
+ nonfouling = Nonfouling()
55
+ hemo = Hemolysis()
56
+
57
+ self.all_funcs = {'binding_affinity1': binding_affinity1,
58
+ 'binding_affinity2': binding_affinity2,
59
+ 'permeability': permeability,
60
+ 'nonfouling': nonfouling,
61
+ 'solubility': sol,
62
+ 'hemolysis': hemo
63
+ }
64
+
65
+ def forward(self, input_seqs):
66
+ scores = []
67
+
68
+ for i, score_func in enumerate(self.score_func_names):
69
+ score = self.all_funcs[score_func](input_seqs = input_seqs)
70
+
71
+ scores.append(score)
72
+
73
+ # convert to numpy arrays with shape (num_sequences, num_functions)
74
+ scores = np.float32(scores).T
75
+
76
+ return scores
77
+
78
+ def __call__(self, input_seqs: list):
79
+ return self.forward(input_seqs)
80
+
81
+
82
+ def unittest():
+     amhr = 'MLGSLGLWALLPTAVEAPPNRRTCVFFEAPGVRGSTKTLGELLDTGTELPRAIRCLYSRCCFGIWNLTQDRAQVEMQGCRDSDEPGCESLHCDPSPRAHPSPGSTLFTCSCGTDFCNANYSHLPPPGSPGTPGSQGPQAAPGESIWMALVLLGLFLLLLLLLGSIILALLQRKNYRVRGEPVPEPRPDSGRDWSVELQELPELCFSQVIREGGHAVVWAGQLQGKLVAIKAFPPRSVAQFQAERALYELPGLQHDHIVRFITASRGGPGRLLSGPLLVLELHPKGSLCHYLTQYTSDWGSSLRMALSLAQGLAFLHEERWQNGQYKPGIAHRDLSSQNVLIREDGSCAIGDLGLALVLPGLTQPPAWTPTQPQGPAAIMEAGTQRYMAPELLDKTLDLQDWGMALRRADIYSLALLLWEILSRCPDLRPDSSPPPFQLAYEAELGNTPTSDELWALAVQERRRPYIPSTWRCFATDPDGLRELLEDCWDADPEARLTAECVQQRLAALAHPQESHPFPESCPRGCPPLCPEDCTSIPAPTILPCRPQRSACHFSVQQGPCSRNPQPACTLSPV'
+     tfr = 'MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEENADNNTKANVTKPKRCSGSICYGTIAVIVFFLIGFMIGYLGYCKGVEPKTECERLAGTESPVREEPGEDFPAARRLYWDDLKRKLSEKLDSTDFTGTIKLLNENSYVPREAGSQKDENLALYVENQFREFKLSKVWRDQHFVKIQVKDSAQNSVIIVDKNGRLVYLVENPGGYVAYSKAATVTGKLVHANFGTKKDFEDLYTPVNGSIVIVRAGKITFAEKVANAESLNAIGVLIYMDQTKFPIVNAELSFFGHAHLGTGDPYTPGFPSFNHTQFPPSRSSGLPNIPVQTISRAAAEKLFGNMEGDCPSDWKTDSTCRMVTSESKNVKLTVSNVLKEIKILNIFGVIKGFVEPDHYVVVGAQRDAWGPGAAKSGVGTALLLKLAQMFSDMVLKDGFQPSRSIIFASWSAGDFGSVGATEWLEGYLSSLHLKAFTYINLDKAVLGTSNFKVSASPLLYTLIEKTMQNVKHPVTGQFLYQDSNWASKVEKLTLDNAAFPFLAYSGIPAVSFCFCEDTDYPYLGTTMDTYKELIERIPELNKVARAAAEVAGQFVIKLTHDVELNLDYERYNSQLLSFVRDLNQYRADIKEMGLSLQWLYSARGDFFRATSRLTTDFGNAEKTDRFVMKKLNDRVMRVEYHFLSPYVSPKESPFRHVFWGSGSHTLPALLENLKLRKQNNGAFNETLFRNQLALATWTIQGAANALSGDVWDIDNEF'
+     gfap = 'MERRRITSAARRSYVSSGEMMVGGLAPGRRLGPGTRLSLARMPPPLPTRVDFSLAGALNAGFKETRASERAEMMELNDRFASYIEKVRFLEQQNKALAAELNQLRAKEPTKLADVYQAELRELRLRLDQLTANSARLEVERDNLAQDLATVRQKLQDETNLRLEAENNLAAYRQEADEATLARLDLERKIESLEEEIRFLRKIHEEEVRELQEQLARQQVHVELDVAKPDLTAALKEIRTQYEAMASSNMHEAEEWYRSKFADLTDAAARNAELLRQAKHEANDYRRQLQSLTCDLESLRGTNESLERQMREQEERHVREAASYQEALARLEEEGQSLKDEMARHLQEYQDLLNVKLALDIEIATYRKLLEGEENRITIPVQTFSNLQIRETSLDTKSVSEGHLKRNIVVKTVEMRDGEVIKESKQEHKDVM'
+     glp1 = 'MAGAPGPLRLALLLLGMVGRAGPRPQGATVSLWETVQKWREYRRQCQRSLTEDPPPATDLFCNRTFDEYACWPDGEPGSFVNVSCPWYLPWASSVPQGHVYRFCTAEGLWLQKDNSSLPWRDLSECEESKRGERSSPEEQLLFLYIIYTVGYALSFSALVIASAILLGFRHLHCTRNYIHLNLFASFILRALSVFIKDAALKWMYSTAAQQHQWDGLLSYQDSLSCRLVFLLMQYCVAANYYWLLVEGVYLYTLLAFSVLSEQWIFRLYVSIGWGVPLLFVVPWGIVKYLYEDEGCWTRNSNMNYWLIIRLPILFAIGVNFLIFVRVICIVVSKLKANLMCKTDIKCRLAKSTLTLIPLLGTHEVIFAFVMDEHARGTLRFIKLFTELSFTSFQGLMVAILYCFVNNEVQLEFRKSWERWRLEHLHIQRDSSMKPLKCPTSSLSSGATAGSSMYTATCQASCS'
+     glast = 'MTKSNGEEPKMGGRMERFQQGVRKRTLLAKKKVQNITKEDVKSYLFRNAFVLLTVTAVIVGTILGFTLRPYRMSYREVKYFSFPGELLMRMLQMLVLPLIISSLVTGMAALDSKASGKMGMRAVVYYMTTTIIAVVIGIIIVIIIHPGKGTKENMHREGKIVRVTAADAFLDLIRNMFPPNLVEACFKQFKTNYEKRSFKVPIQANETLVGAVINNVSEAMETLTRITEELVPVPGSVNGVNALGLVVFSMCFGFVIGNMKEQGQALREFFDSLNEAIMRLVAVIMWYAPVGILFLIAGKIVEMEDMGVIGGQLAMYTVTVIVGLLIHAVIVLPLLYFLVTRKNPWVFIGGLLQALITALGTSSSSATLPITFKCLEENNGVDKRVTRFVLPVGATINMDGTALYEALAAIFIAQVNNFELNFGQIITISITATAASIGAAGIPQAGLVTMVIVLTSVGLPTDDITLIIAVDWFLDRLRTTTNVLGDSLGAGIVEHLSRHELKNRDVEMGNSVIEENEMKKPYQLIAQDNETEKPIDSETKM'
+     ncam = 'LQTKDLIWTLFFLGTAVSLQVDIVPSQGEISVGESKFFLCQVAGDAKDKDISWFSPNGEKLTPNQQRISVVWNDDSSSTLTIYNANIDDAGIYKCVVTGEDGSESEATVNVKIFQKLMFKNAPTPQEFREGEDAVIVCDVVSSLPPTIIWKHKGRDVILKKDVRFIVLSNNYLQIRGIKKTDEGTYRCEGRILARGEINFKDIQVIVNVPPTIQARQNIVNATANLGQSVTLVCDAEGFPEPTMSWTKDGEQIEQEEDDEKYIFSDDSSQLTIKKVDKNDEAEYICIAENKAGEQDATIHLKVFAKPKITYVENQTAMELEEQVTLTCEASGDPIPSITWRTSTRNISSEEKASWTRPEKQETLDGHMVVRSHARVSSLTLKSIQYTDAGEYICTASNTIGQDSQSMYLEVQYAPKLQGPVAVYTWEGNQVNITCEVFAYPSATISWFRDGQLLPSSNYSNIKIYNTPSASYLEVTPDSENDFGNYNCTAVNRIGQESLEFILVQADTPSSPSIDQVEPYSSTAQVQFDEPEATGGVPILKYKAEWRAVGEEVWHSKWYDAKEASMEGIVTIVGLKPETTYAVRLAALNGKGLGEISAASEF'
+     cereblon = 'MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNIINFDTSLPTSHTYLGADMEEFHGRTLHDDDSCQVIPVLPQVMMILIPGQTLPLQLFHPQEVSMVRNLIQKDRTFAVLAYSNVQEREAQFGTTAEIYAYREEQDFGIEIVKVKAIGRQRFKVLELRTQSDGIQQAKVQILPECVLPSTMSAVQLESLNKCQIFPSKPVSREDQCSYKWWQKYQKRKFHCANLTSWPRWLYSLYDAETLMDRIKKQLREWDENLKDDSLPSNPIDFSYRVAACLPIDDVLRIQLLKIGSAIQRLRCELDIMNKCTSLCCKQCQETEITTKNEIFSLSLCGPMAAYVNPHGYVHETLTVYKACNLNLIGRPSTEHSWFPGYAWTVAQCKICASHIGWKFTATKKDMSPQKFWGLTRSALLPTIPDTEDEISPDKVILCL'
+ 
+     num_iter = 0
+     score_func_times = [0, 1, 2, 3, 4, 5]
+ 
+     scoring = ScoringFunctions(score_func_names=['binding_affinity1', 'solubility', 'hemolysis', 'nonfouling', 'permeability'], prot_seqs=[tfr])
+ 
+     smiles = ['N2[C@H](CC(C)C)C(=O)N1[C@@H](CCC1)C(=O)N1[C@@H](CCC1)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](Cc1ccccc1C(F)(F)F)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(=O)N)C2(=O)']
+ 
+     scores = scoring(input_seqs=smiles)
+     print(scores)
+     print(len(scores))
+ 
+ if __name__ == '__main__':
+     unittest()
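
Note: ScoringFunctions.forward returns scores with shape (num_sequences, num_functions), with columns ordered like score_func_names, so each property can be read off by column index. A minimal consumption sketch (the function names mirror the unittest above; the SMILES string is a placeholder):

    scoring = ScoringFunctions(score_func_names=['solubility', 'hemolysis'], prot_seqs=[])
    scores = scoring(['CC(C)C[C@@H](N)C(=O)O'])          # placeholder SMILES
    sol_score, hemo_score = scores[0, 0], scores[0, 1]  # row 0 = first input sequence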
train/binary_xg.py ADDED
@@ -0,0 +1,223 @@
+ import pandas as pd
+ import numpy as np
+ import torch
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import precision_recall_curve, f1_score
+ import optuna
+ from optuna.trial import TrialState
+ import xgboost as xgb
+ import os
+ from datasets import load_from_disk
+ from lightning.pytorch import seed_everything
+ from rdkit import Chem, rdBase, DataStructs
+ from typing import List
+ from rdkit.Chem import AllChem
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import accuracy_score, roc_auc_score
+ 
+ base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
+ 
+ def save_and_plot_binary_predictions(y_true_train, y_pred_train, y_true_val, y_pred_val, threshold, output_path):
+     """
+     Saves the true and predicted values for the training and validation sets, and generates binary classification plots.
+ 
+     Parameters:
+         y_true_train (array): True labels for the training set.
+         y_pred_train (array): Predicted probabilities for the training set.
+         y_true_val (array): True labels for the validation set.
+         y_pred_val (array): Predicted probabilities for the validation set.
+         threshold (float): Classification threshold for predictions.
+         output_path (str): Directory to save the CSV files and plots.
+     """
+     os.makedirs(output_path, exist_ok=True)
+ 
+     # Convert probabilities to binary predictions
+     y_pred_train_binary = (y_pred_train >= threshold).astype(int)
+     y_pred_val_binary = (y_pred_val >= threshold).astype(int)
+ 
+     # Save training predictions
+     train_df = pd.DataFrame({
+         'True Label': y_true_train,
+         'Predicted Probability': y_pred_train,
+         'Predicted Label': y_pred_train_binary
+     })
+     train_df.to_csv(os.path.join(output_path, 'train_predictions_binary.csv'), index=False)
+ 
+     # Save validation predictions
+     val_df = pd.DataFrame({
+         'True Label': y_true_val,
+         'Predicted Probability': y_pred_val,
+         'Predicted Label': y_pred_val_binary
+     })
+     val_df.to_csv(os.path.join(output_path, 'val_predictions_binary.csv'), index=False)
+ 
+     # Plot training predictions
+     plot_binary_correlation(
+         y_true_train,
+         y_pred_train,
+         threshold,
+         title="Training Set Binary Classification Plot",
+         output_file=os.path.join(output_path, 'train_classification_plot.png')
+     )
+ 
+     # Plot validation predictions
+     plot_binary_correlation(
+         y_true_val,
+         y_pred_val,
+         threshold,
+         title="Validation Set Binary Classification Plot",
+         output_file=os.path.join(output_path, 'val_classification_plot.png')
+     )
+ 
+ def plot_binary_correlation(y_true, y_pred, threshold, title, output_file):
+     """
+     Generates a scatter plot for binary classification and saves it to a file.
+ 
+     Parameters:
+         y_true (array): True labels.
+         y_pred (array): Predicted probabilities.
+         threshold (float): Classification threshold for predictions.
+         title (str): Title of the plot.
+         output_file (str): Path to save the plot.
+     """
+     # Scatter plot
+     plt.figure(figsize=(10, 8))
+     plt.scatter(y_true, y_pred, alpha=0.5, label='Data points', color='#BC80FF')
+ 
+     # Add threshold line
+     plt.axhline(y=threshold, color='red', linestyle='--', label=f'Threshold = {threshold}')
+ 
+     # Add annotations
+     plt.title(title)
+     plt.xlabel("True Labels")
+     plt.ylabel("Predicted Probability")
+     plt.legend()
+ 
+     # Save and show the plot
+     plt.tight_layout()
+     plt.savefig(output_file)
+     plt.show()
+ 
+ seed_everything(42)
+ 
+ dataset = load_from_disk(f'{base_path}/data/solubility')
+ 
+ sequences = np.stack(dataset['sequence'])  # sequences are SMILES strings
+ labels = np.stack(dataset['labels'])
+ embeddings = np.stack(dataset['embedding'])
+ 
+ # Initialize best F1 score and model path
+ best_f1 = -np.inf
+ best_model_path = f"{base_path}/src/solubility"
+ 
+ # Trial callback
+ def trial_info_callback(study, trial):
+     if study.best_trial == trial:
+         print(f"Trial {trial.number}:")
+         print(f"  Weighted F1 Score: {trial.value}")
+ 
+ 
+ def objective(trial):
+     # Define hyperparameters
+     params = {
+         'objective': 'binary:logistic',
+         'lambda': trial.suggest_float('lambda', 1e-8, 50.0, log=True),
+         'alpha': trial.suggest_float('alpha', 1e-8, 50.0, log=True),
+         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
+         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
+         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
+         'max_depth': trial.suggest_int('max_depth', 2, 15),
+         'min_child_weight': trial.suggest_int('min_child_weight', 1, 500),
+         'gamma': trial.suggest_float('gamma', 0, 10.0),
+         'tree_method': 'hist',
+         'device': 'cuda:6',
+     }
+ 
+     # Suggest number of boosting rounds
+     num_boost_round = trial.suggest_int('num_boost_round', 10, 1000)
+     threshold = 0.5  # classification threshold
+ 
+     # Split the data
+     train_idx, val_idx = train_test_split(
+         np.arange(len(sequences)), test_size=0.2, stratify=labels, random_state=42
+     )
+     train_subset = dataset.select(train_idx).with_format("torch")
+     val_subset = dataset.select(val_idx).with_format("torch")
+ 
+     # Extract embeddings and labels for train/validation
+     train_embeddings = np.array(train_subset['embedding'])
+     valid_embeddings = np.array(val_subset['embedding'])
+     train_labels = np.array(train_subset['labels'])
+     valid_labels = np.array(val_subset['labels'])
+ 
+     # Prepare training and validation sets
+     dtrain = xgb.DMatrix(train_embeddings, label=train_labels)
+     dvalid = xgb.DMatrix(valid_embeddings, label=valid_labels)
+ 
+     # Train the model
+     model = xgb.train(
+         params=params,
+         dtrain=dtrain,
+         num_boost_round=num_boost_round,
+         evals=[(dvalid, "validation")],
+         early_stopping_rounds=50,
+         verbose_eval=False,
+     )
+ 
+     # Predict probabilities
+     preds_train = model.predict(dtrain)
+     preds_val = model.predict(dvalid)
+ 
+     # Calculate metrics
+     f1_val = f1_score(valid_labels, (preds_val >= threshold).astype(int), average="weighted")
+     auc_val = roc_auc_score(valid_labels, preds_val)
+     print(f"Trial {trial.number}: AUC: {auc_val:.3f}, F1 Score: {f1_val:.3f}")
+ 
+     # Save the model if it has the best F1 score so far
+     current_best = trial.study.user_attrs.get("best_f1", -np.inf)
+     if f1_val > current_best:
+         trial.study.set_user_attr("best_f1", f1_val)
+         trial.study.set_user_attr("best_auc", auc_val)
+         trial.study.set_user_attr("best_trial", trial.number)
+         os.makedirs(best_model_path, exist_ok=True)
+ 
+         # Save the model
+         model.save_model(os.path.join(best_model_path, "best_model_f1.json"))
+         print(f"✓ NEW BEST! Trial {trial.number}: F1={f1_val:.4f}, AUC={auc_val:.4f} - Model saved!")
+ 
+         # Save and plot binary predictions
+         save_and_plot_binary_predictions(
+             train_labels, preds_train, valid_labels, preds_val, threshold, best_model_path
+         )
+ 
+     return f1_val
+ 
+ if __name__ == "__main__":
+     study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
+     study.optimize(objective, n_trials=200, callbacks=[trial_info_callback])
+ 
+     # Prepare summary text
+     summary = []
+     summary.append("\n" + "="*60)
+     summary.append("OPTIMIZATION COMPLETE")
+     summary.append("="*60)
+     summary.append(f"Number of finished trials: {len(study.trials)}")
+     summary.append(f"\nBest Trial: #{study.user_attrs.get('best_trial', 'N/A')}")
+     summary.append(f"Best F1 Score: {study.user_attrs.get('best_f1', None):.4f}")
+     summary.append(f"Best AUC Score: {study.user_attrs.get('best_auc', None):.4f}")
+     summary.append(f"Optuna Best Trial Value: {study.best_trial.value:.4f}")
+     summary.append(f"\nBest hyperparameters:")
+     for key, value in study.best_trial.params.items():
+         summary.append(f"  {key}: {value}")
+     summary.append("="*60)
+ 
+     # Print to console
+     for line in summary:
+         print(line)
+ 
+     # Save to file
+     metrics_file = os.path.join(best_model_path, "optimization_metrics.txt")
+     with open(metrics_file, 'w') as f:
+         f.write('\n'.join(summary))
+     print(f"\n✓ Metrics saved to: {metrics_file}")
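
Once the study finishes, the best solubility booster is serialized to best_model_f1.json under best_model_path. A minimal inference sketch (the embedding array is a stand-in; real inputs must come from the same encoder, with the same width, that produced dataset['embedding']):

    import numpy as np
    import xgboost as xgb

    booster = xgb.Booster()
    booster.load_model("best_model_f1.json")    # saved under best_model_path

    emb = np.zeros((4, 768), dtype=np.float32)  # placeholder; match the training embedding width
    probs = booster.predict(xgb.DMatrix(emb))   # binary:logistic outputs probabilities
    preds = (probs >= 0.5).astype(int)          # same 0.5 threshold as in training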
train/permeability_xg.py ADDED
@@ -0,0 +1,186 @@
+ import pandas as pd
+ import numpy as np
+ import optuna
+ from optuna.trial import TrialState
+ from rdkit import Chem
+ from rdkit.Chem import AllChem
+ from sklearn.metrics import mean_squared_error
+ from sklearn.model_selection import train_test_split
+ import xgboost as xgb
+ import os
+ from datasets import load_from_disk
+ from scipy.stats import spearmanr
+ import matplotlib.pyplot as plt
+ 
+ base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
+ 
+ def save_and_plot_predictions(y_true_train, y_pred_train, y_true_val, y_pred_val, output_path):
+     os.makedirs(output_path, exist_ok=True)
+ 
+     # Save training predictions
+     train_df = pd.DataFrame({'True Permeability': y_true_train, 'Predicted Permeability': y_pred_train})
+     train_df.to_csv(os.path.join(output_path, 'train_predictions.csv'), index=False)
+ 
+     # Save validation predictions
+     val_df = pd.DataFrame({'True Permeability': y_true_val, 'Predicted Permeability': y_pred_val})
+     val_df.to_csv(os.path.join(output_path, 'val_predictions.csv'), index=False)
+ 
+     # Plot training predictions
+     plot_correlation(
+         y_true_train,
+         y_pred_train,
+         title="Training Set Correlation Plot",
+         output_file=os.path.join(output_path, 'train_correlation.png'),
+     )
+ 
+     # Plot validation predictions
+     plot_correlation(
+         y_true_val,
+         y_pred_val,
+         title="Validation Set Correlation Plot",
+         output_file=os.path.join(output_path, 'val_correlation.png'),
+     )
+ 
+ def plot_correlation(y_true, y_pred, title, output_file):
+     spearman_corr, _ = spearmanr(y_true, y_pred)
+ 
+     # Scatter plot
+     plt.figure(figsize=(10, 8))
+     plt.scatter(y_true, y_pred, alpha=0.5, label='Data points', color='#BC80FF')
+     plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='teal', linestyle='--', label='Ideal fit')
+ 
+     # Add annotations
+     plt.title(f"{title}\nSpearman Correlation: {spearman_corr:.3f}")
+     plt.xlabel("True Permeability (logP)")
+     plt.ylabel("Predicted Permeability (logP)")
+     plt.legend()
+ 
+     # Save and show the plot
+     plt.tight_layout()
+     plt.savefig(output_file)
+     plt.show()
+ 
+ # Load dataset
+ dataset = load_from_disk(f'{base_path}/data/permeability')
+ 
+ # Extract sequences, labels, and embeddings
+ sequences = np.stack(dataset['sequence'])
+ labels = np.stack(dataset['labels'])  # Regression labels
+ embeddings = np.stack(dataset['embedding'])  # Pre-trained embeddings
+ 
+ # Function to compute Morgan fingerprints
+ def compute_morgan_fingerprints(smiles_list, radius=2, n_bits=2048):
+     fps = []
+     for smiles in smiles_list:
+         mol = Chem.MolFromSmiles(smiles)
+         if mol is not None:
+             fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
+             fps.append(np.array(fp))
+         else:
+             # If the SMILES string is invalid, use a zero vector
+             fps.append(np.zeros(n_bits))
+             print(f"Invalid SMILES: {smiles}")
+     return np.array(fps)
+ 
+ # Compute Morgan fingerprints for the sequences
+ #morgan_fingerprints = compute_morgan_fingerprints(sequences)
+ 
+ # Concatenate embeddings with Morgan fingerprints
+ #input_features = np.concatenate([embeddings, morgan_fingerprints], axis=1)
+ input_features = embeddings
+ 
+ # Initialize global variables
+ best_model_path = f"{base_path}/src/permeability"
+ os.makedirs(best_model_path, exist_ok=True)
+ 
+ def trial_info_callback(study, trial):
+     if study.best_trial == trial:
+         print(f"Trial {trial.number}:")
+         print(f"  MSE: {trial.value}")
+ 
+ def objective(trial):
+     # Define hyperparameters (scale_pos_weight is omitted: it only applies to binary objectives)
+     params = {
+         'objective': 'reg:squarederror',
+         'lambda': trial.suggest_float('lambda', 0.1, 10.0, log=True),
+         'alpha': trial.suggest_float('alpha', 0.1, 10.0, log=True),
+         'gamma': trial.suggest_float('gamma', 0, 5),
+         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
+         'subsample': trial.suggest_float('subsample', 0.6, 0.9),
+         'learning_rate': trial.suggest_float('learning_rate', 1e-5, 0.1),
+         'max_depth': trial.suggest_int('max_depth', 2, 30),
+         'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
+         'tree_method': 'hist',
+         'device': 'cuda:6',
+     }
+     num_boost_round = trial.suggest_int('num_boost_round', 10, 1000)
+ 
+     # Train-validation split
+     X_train, X_val, y_train, y_val = train_test_split(input_features, labels, test_size=0.2, random_state=42)
+ 
+     # Convert data to DMatrix
+     dtrain = xgb.DMatrix(X_train, label=y_train)
+     dvalid = xgb.DMatrix(X_val, label=y_val)
+ 
+     # Train XGBoost
+     model = xgb.train(
+         params=params,
+         dtrain=dtrain,
+         num_boost_round=num_boost_round,
+         evals=[(dvalid, "validation")],
+         early_stopping_rounds=50,
+         verbose_eval=False,
+     )
+ 
+     # Predict and evaluate
+     preds_train = model.predict(dtrain)
+     preds_val = model.predict(dvalid)
+ 
+     mse = mean_squared_error(y_val, preds_val)
+ 
+     # Calculate Spearman rank correlation for both train and validation
+     spearman_train, _ = spearmanr(y_train, preds_train)
+     spearman_val, _ = spearmanr(y_val, preds_val)
+     print(f"Train Spearman: {spearman_train:.4f}, Val Spearman: {spearman_val:.4f}")
+ 
+     # Save the best model
+     if trial.study.user_attrs.get("best_mse", np.inf) > mse:
+         trial.study.set_user_attr("best_mse", mse)
+         trial.study.set_user_attr("best_spearman_train", spearman_train)
+         trial.study.set_user_attr("best_spearman_val", spearman_val)
+         trial.study.set_user_attr("best_trial", trial.number)
+         model.save_model(os.path.join(best_model_path, "best_model.json"))
+         save_and_plot_predictions(y_train, preds_train, y_val, preds_val, best_model_path)
+         print(f"✓ NEW BEST! Trial {trial.number}: MSE={mse:.4f}, Train Spearman={spearman_train:.4f}, Val Spearman={spearman_val:.4f}")
+ 
+     return mse
+ 
+ if __name__ == "__main__":
+     study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
+     study.optimize(objective, n_trials=200, callbacks=[trial_info_callback])
+ 
+     # Prepare summary text
+     summary = []
+     summary.append("\n" + "="*60)
+     summary.append("OPTIMIZATION COMPLETE")
+     summary.append("="*60)
+     summary.append(f"Number of finished trials: {len(study.trials)}")
+     summary.append(f"\nBest Trial: #{study.user_attrs.get('best_trial', 'N/A')}")
+     summary.append(f"Best MSE: {study.best_trial.value:.4f}")
+     summary.append(f"Best Training Spearman Correlation: {study.user_attrs.get('best_spearman_train', None):.4f}")
+     summary.append(f"Best Validation Spearman Correlation: {study.user_attrs.get('best_spearman_val', None):.4f}")
+     summary.append(f"\nBest hyperparameters:")
+     for key, value in study.best_trial.params.items():
+         summary.append(f"  {key}: {value}")
+     summary.append("="*60)
+ 
+     # Print to console
+     for line in summary:
+         print(line)
+ 
+     # Save to file
+     metrics_file = os.path.join(best_model_path, "optimization_metrics.txt")
+     with open(metrics_file, 'w') as f:
+         f.write('\n'.join(summary))
+     print(f"\n✓ Metrics saved to: {metrics_file}")
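
A closing note on the commented-out fingerprint path in train/permeability_xg.py: if re-enabled, the Morgan bits would be concatenated onto the embeddings before the train/validation split, and any model trained that way must also see concatenated features at inference time. A sketch under that assumption, reusing compute_morgan_fingerprints, sequences, and embeddings as defined in the script:

    # Widen the feature matrix from embed_dim to embed_dim + 2048
    morgan_fingerprints = compute_morgan_fingerprints(sequences)  # (N, 2048) bit vectors
    input_features = np.concatenate([embeddings, morgan_fingerprints], axis=1)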