Spaces:
Running
Running
Commit ·
084b58f
1
Parent(s): 47e3259
Initial Monomerizer Space
Browse files- GPepT_analysis_pipeline.py +59 -0
- demo/example.svg +0 -0
- demo/example_GPepT_generated_sequences.txt +9 -0
- demo/example_smiles.txt +101 -0
- demo/example_smiles_IDs.txt +101 -0
- dictionary.txt +0 -0
- requirements.txt +7 -0
- run_pipeline.py +57 -0
- src/analyse.py +128 -0
- src/demonomerizer.py +211 -0
- src/draw.py +67 -0
- src/monomer_analyzer.py +1 -0
- src/monomerizer.py +882 -0
- src/prepare_GPepT_data.py +43 -0
- src/standardizer.py +98 -0
GPepT_analysis_pipeline.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
import datetime
|
| 6 |
+
|
| 7 |
+
def _with_default_flags(extra_args, defaults):
    """Return *extra_args* as a new list, prefixed with every default flag it does not set.

    Args:
        extra_args: Caller-supplied command-line tokens, or None / empty.
        defaults: Iterable of ``(flag, value)`` pairs used as fallbacks.

    A flag counts as "set" when it appears either as its own token
    (``--flag value``) or in the ``--flag=value`` form.
    """
    merged = list(extra_args) if extra_args else []
    missing = []
    for flag, value in defaults:
        already_set = any(
            str(arg) == flag or str(arg).startswith(flag + "=") for arg in merged
        )
        if not already_set:
            missing.extend([flag, value])
    return missing + merged


def run_pipeline(sequence_file, output_dir, demonomerized_file, demonomerizer_args=None, analyse_args=None):
    """Run src/demonomerizer.py and then src/analyse.py as child processes.

    Args:
        sequence_file: Path passed to demonomerizer.py as ``--sequence_file``.
        output_dir: Directory created if missing; passed to demonomerizer.py as
            ``--output_dir`` and to analyse.py as ``--input_dir``.
        demonomerized_file: File name passed as ``--demonomerized_file``; the
            path ``output_dir/demonomerized_file`` is handed to analyse.py as
            ``--mols_file``.
        demonomerizer_args: Optional extra tokens for demonomerizer.py. Any of
            ``--NNAA_file`` / ``--batch_size`` not present here fall back to
            ``dictionary.txt`` / ``8``.
        analyse_args: Optional extra tokens for analyse.py. ``--target_type``
            falls back to ``peptides`` when not present here.

    Raises:
        subprocess.CalledProcessError: If either script exits with a non-zero
            status (``check=True``).

    Note:
        The script paths and ``dictionary.txt`` are relative, so they resolve
        against the current working directory of the caller.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Run demonomerizer.py
    print(f"Running demonomerizer.py... Input: {sequence_file}")
    demonomerizer_command = [
        sys.executable, "src/demonomerizer.py",
        "--sequence_file", sequence_file,
        # Honour the caller's options; the defaults only fill the gaps so a
        # caller-provided --batch_size is no longer silently replaced by 8.
        *_with_default_flags(
            demonomerizer_args,
            [("--NNAA_file", "dictionary.txt"), ("--batch_size", "8")],
        ),
        "--output_dir", output_dir,
        "--demonomerized_file", demonomerized_file,
    ]

    subprocess.run(demonomerizer_command, check=True)

    demonomerized_path = os.path.join(output_dir, demonomerized_file)

    # Step 2: Run analyse.py
    print("Running analyse.py...")
    analyse_command = [
        sys.executable, "src/analyse.py",
        "--mols_file", demonomerized_path,
        "--input_dir", output_dir,
        # Emit --target_type exactly once: the caller's value if given,
        # otherwise "peptides".
        # NOTE(review): assumes analyse.py treats --target_type as a plain
        # single-valued option, so dropping the duplicate keeps the effective
        # value unchanged - confirm against src/analyse.py.
        *_with_default_flags(analyse_args, [("--target_type", "peptides")]),
    ]

    subprocess.run(analyse_command, check=True)
|
| 37 |
+
|
| 38 |
+
if __name__ == "__main__":
    # Command-line entry point: parse the options, split them into the
    # per-script argument lists, and hand everything to run_pipeline().
    cli = argparse.ArgumentParser(description="Run the demonomerizer pipeline.")
    cli.add_argument("--sequence_file", default="demonomerized.txt", help="Input sequence file")
    # The default output directory is stamped with the time at which the
    # parser is built, e.g. output/20240131_235959.
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    cli.add_argument("--output_dir", default=f"output/{timestamp}", help="Directory to store output")
    cli.add_argument("--demonomerized_file", default="sequences_standardized.txt", help="Output demonomerized file name")
    cli.add_argument("--batch_size", type=int, default=8, help="Batch size for demonomerizer.py")
    cli.add_argument("-fetch_names", action="store_true", help="Fetch names from PubChem in analyse.py")
    cli.add_argument("--target_type", default="ncAAs", help="Target type: ncAAs or peptides")

    opts = cli.parse_args()

    # Tokens intended for demonomerizer.py
    for_demonomerizer = ["--NNAA_file", "dictionary.txt", "--batch_size", str(opts.batch_size)]

    # Tokens intended for analyse.py
    for_analyse = ["-fetch_names"] if opts.fetch_names else []
    if opts.target_type:
        for_analyse += ["--target_type", opts.target_type]

    run_pipeline(
        opts.sequence_file,
        opts.output_dir,
        opts.demonomerized_file,
        for_demonomerizer,
        for_analyse,
    )
|
demo/example.svg
ADDED
|
|
demo/example_GPepT_generated_sequences.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
SEQUENCE
|
| 2 |
+
X7681VZ81
|
| 3 |
+
X1132RZ0
|
| 4 |
+
X369X2326Z0
|
| 5 |
+
X72AZ4941
|
| 6 |
+
X183PLGPGZ421
|
| 7 |
+
X2954AZ88
|
| 8 |
+
X34X6765X5Z11
|
| 9 |
+
X47WX47LFKKIGAVLKVLZ0
|
demo/example_smiles.txt
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
SMILES
|
| 2 |
+
CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NO)NC(=O)OCc1ccccc1)C(C)C)[C@@H](O)CC(=O)NCCc1ccccc1
|
| 3 |
+
N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1cc(I)c(O)c(I)c1)C(N)=O
|
| 4 |
+
NC(=O)[C@@H]1C[C@H](NC(=O)C(F)(F)F)CN1C(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-]
|
| 5 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCCNC(=N)N)NC(=O)OCc1ccccc1)[C@@H](O)CC(=O)NC1CCCCC1
|
| 6 |
+
CC(C)C[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)Cc1ccccc1)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
|
| 7 |
+
C[C@H](NC(=O)[C@@H](CO)NS(=O)(=O)c1ccccc1)C(=O)N[C@H]1CCCN(C(=N)N)C1O
|
| 8 |
+
C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](c2csc(-c3ccccc3)n2)CN1C(=O)[C@@H](NC(=O)OC1CCCC1)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
|
| 9 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCC[N+](C)(C)C)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 10 |
+
CSCC[C@H](NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](C)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)CC(C)C)[C@@H](C)O)C(=O)O
|
| 11 |
+
CSCC[C@H](NC(=O)[C@H](Cc1cnc[nH]1)NC(=O)[C@H](CO)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)Cc1cnc[nH]1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)O
|
| 12 |
+
CC(C)(C)NC(=O)C1(C2CCCCC2)CCN(C(=O)[C@@H](Cc2ccc(F)cc2)NC(=O)[C@@H]2CNC3(CC3)CN2)CC1
|
| 13 |
+
CC(=O)O[C@H]1C(=O)[C@@]2(C)[C@H]([C@H](OC(=O)c3ccccc3)[C@]3(O)CC(OC(=O)[C@H](OC(=O)NCCNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc4ccccc4)NC(=O)[C@@H](C)N)C(NC(=O)c4ccccc4)c4ccccc4)C(C)=C1C3(C)C)[C@]1(OC(C)=O)CO[C@@H]1C[C@@H]2O
|
| 14 |
+
COC(=O)[C@@H](Cc1ccccc1)NC(=O)[C@H](NC(=O)CC(O)CC(CC(C)C)NC(=O)[C@@H](N)Cc1c[nH]cn1)C(C)C
|
| 15 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C(C)C)C(O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCC[N+](C)(C)C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
|
| 16 |
+
CC1OC(SCCCCCCNC(=O)[C@H](CCC(N)=O)NC(=O)[C@@H](C)N)C(O)C(O)C1O
|
| 17 |
+
CC1(C)N([O])C(c2ccc(OCC(=O)NCCCNC(=O)[C@H](Cc3ccc(O)cc3)NC(=O)[C@@H](N)CCCNC(=N)N)cc2)=[N+]([O-])C1(C)C
|
| 18 |
+
CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CSCCC[P+](C)(C)C)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O
|
| 19 |
+
CCN(CC)CCC(=O)NC(C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1)C(C)O
|
| 20 |
+
N[C@@H](Cc1cc2ccccc2[nH]1)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)NCCCC(=O)NCC(=O)NCCCCCCOP(=O)(O)Oc1ccccc1Cl
|
| 21 |
+
C[C@@H](N)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)NCCCCC(=O)OCCNc1nc(NCCc2ccccc2)c2cnn(/C=C/c3ccccc3)c2n1
|
| 22 |
+
CCCC[PH](CCCC)(CCCC)Cc1ccc(NC(=O)C2Cc3ccccc3CN2C(=O)[C@@H](N)CCc2ccccc2)cc1
|
| 23 |
+
CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CCCCN)C(=O)N[C@@H](CCCCN)C(=O)O
|
| 24 |
+
Cc1cc(C(=O)N[C@H](C(=O)N[C@@H](Cc2ccc(F)cc2)C(=O)N[C@@H](/C=C/C(=O)OCc2nc3cc(Cl)ccc3[nH]2)CCC(N)=O)C(C)C)no1
|
| 25 |
+
CCN(CC)c1ccc2c(-c3ccc(S(=O)(=O)NCCCC[C@H](NC(=O)Cc4csc(=N)n4C)C(=O)N[C@@H](Cc4cn(Cc5ccccc5)c[n+]4C)C(=O)NC4CCN(C)CC4)cc3S(=O)(=O)[O-])c3ccc(=[N+](CC)CC)cc-3oc2c1
|
| 26 |
+
CC(C)C[C@H](N)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
|
| 27 |
+
CC1=CC(C)=[N+]2C1=Cc1ccc(CCC(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O)n1[B-]2(F)F
|
| 28 |
+
N=C(N)NCCC[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](CC(=O)O)NC(=O)[C@@H](CO)NC(=O)[C@@H](N)CC(=O)O)C(=O)O
|
| 29 |
+
CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@H]2CCCN2C[C@H]1C(=O)N[C@@H]1CCOc2ccccc21)C1CCC(F)(F)CC1
|
| 30 |
+
COc1ccc(NC(=O)[C@@H]2Cc3ccc(OCC(=O)NO)cc3CN2C(=O)[C@H](C)N)cc1
|
| 31 |
+
COc1cccc(COc2ccc([C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCNC(=N)N)NC(=O)c3cccs3)C(N)=O)cc2)c1
|
| 32 |
+
CC(C)CC(N)C(=O)NCC(=O)NC[C@H](C)B1OC2CC3CC(C3(C)C)[C@@]2(C)O1
|
| 33 |
+
CCN(CC)CCNC(=O)c1ccc(C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C=O)CC(C)C)c(NNN2CCCC2)c1
|
| 34 |
+
CC(O)C(NC(=O)CCN)C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1
|
| 35 |
+
CCCN(CC(=O)N[C@H](C=O)CCCN=C(N)N)C(=O)[C@H]1CCCCN1
|
| 36 |
+
CC(C)C[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(C)C)C(C)C)C(=O)N[C@@H](CO)C(=O)O
|
| 37 |
+
CC(C)(N)C(=O)N[C@H](CCCc1ccccc1)C(=O)N1CCC2(CC1)CC(O)c1ccccc12
|
| 38 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](N)CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CSc1ccc2n1[B-](F)(F)[N+]1=CC=CC1=C2)C(=O)O
|
| 39 |
+
O=C(NCCCCC[C@H](NC(=O)[C@@H]1C[C@@H](N2CCCCC2)CN1C(=O)[C@@H](CC1CCCCC1)NC(=O)c1ccc2ccccc2c1)B(O)O)NC1CCCCC1
|
| 40 |
+
CC(=O)N[C@@H](CCCO/N=C/c1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCC(=O)O)C(N)=O
|
| 41 |
+
COc1ccc(CC(C)(NC(=O)[C@@H]2CCCN2C(=O)CCCc2ccc(O)cc2)C(=O)NCCCN)cc1OC
|
| 42 |
+
N=C(N)N1CCC(C(NS(=O)(=O)Cc2ccccc2)C(=O)NCC(=O)N[C@H]2CCCN(C(=N)N)C2O)CC1
|
| 43 |
+
CCC(=O)NCCOCCOCCNC(=O)/N=C(\N)NCCC[C@H](NC(=O)[C@@H](N)CC(=O)O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1cnc[nH]1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC)C(C)C
|
| 44 |
+
N=C(N)c1ccc(CNC(=O)[C@@H]2CCCN2C(=O)[C@H](N)C2CCCCC2)cc1
|
| 45 |
+
CC(C)C[C@H](NC(=O)CNC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)NCC(N)=O
|
| 46 |
+
CCCCCCCC(=O)OC[C@H](NC(=O)C(C)(C)N)C(=O)N1CCC2(CC1)CN(S(C)(=O)=O)c1ccccc12
|
| 47 |
+
COc1ccc(NC(C)=O)cc1C(=O)NNC(=O)[C@H](CCCCN)NC(=O)CCOC[C@H]1OC(OCCCNC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)OC(C)(C)C)C(C)C)[C@H](O)[C@@H](O)[C@@H]1O
|
| 48 |
+
CSCC[C@H](NC(=O)[C@@H]1Cc2ccccc2CN1)C(=O)NO
|
| 49 |
+
CSCC[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(C)=O)C(=O)NCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](COS(=O)(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
|
| 50 |
+
C=Cc1c(C)c2cc3nc(c4c5[nH]c(cc6nc(cc1[nH]2)C(C)=C6CC)c(C)c5C(=O)C4)[C@@H](CCC(=O)N[C@H](C(=O)N[C@@H](CO)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCCCNOC(=O)CCCN(C)c1ccc(/N=N/c2cccc4nc5ccc(N(CC)CC)cc5[n+](-c5ccccc5)c24)cc1)C(N)=O)[C@@H](C)O)[C@@H]3C
|
| 51 |
+
CC(=O)NC(Cc1ccc([N+](=O)[O-])cc1)C(=O)NCC(N)C(=O)c1ccccc1
|
| 52 |
+
CC(C)(C)[C@H](NC(=O)Cc1cc(Cl)cc(Cl)c1)C(=O)NCC(=O)NC/C=C/S(C)(=O)=O
|
| 53 |
+
CC(C)CCOc1ccc2ccccc2c1-c1c(OCC(=O)N[C@H](CCCCN)C(=O)N[C@H](CCCN)C(=O)N[C@@H](CC(C)C)C(=O)OCc2ccccc2)ccc2ccccc12
|
| 54 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2ccc(OCCCCCN)cc2c1CCCCCCN)C(=O)N[C@@H](CC(C)C)C(=O)O
|
| 55 |
+
CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C)C(C)C
|
| 56 |
+
CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 57 |
+
CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 58 |
+
CC[C@H](C)[C@H](NC(=O)[C@@H](N)C(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](C)C(=O)NCC(=O)O
|
| 59 |
+
CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
|
| 60 |
+
CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 61 |
+
CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 62 |
+
CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 63 |
+
COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)(C(=O)CO)C[C@@H]3O[C@H]1C[C@H]2[C@H](OCN2C(=O)OCc2ccc(NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc3ccccc3)NC(=O)[C@@H](C)N)cc2)[C@H](C)O1
|
| 64 |
+
CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)CC(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 65 |
+
CC(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O)C(C)C)C(C)C
|
| 66 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2c(OCCCCCN=C(N)N)cccc2c1CCCCCCN=C(N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
|
| 67 |
+
CCN(CC(=O)NCC(=O)Nc1cccc(C)c1C)Cc1ccccc1
|
| 68 |
+
CCn1c(-c2nonc2N)nc2c(C#CC(C)(C)O)ncc(OCCCNCCCC(=O)NCCOCCOCCOCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)NCc3ccc(-c4scnc4C)cc3)C(C)(C)C)c21
|
| 69 |
+
Cc1cc2c(s1)-n1c(C)nnc1[C@H](CC(=O)NCCOCCCOCC(=O)N[C@@H](C(=O)N1C[C@H](O)C[C@H]1C(=O)NCc1ccc(-c3scnc3C)cc1)C(C)(C)C)N=C2c1ccc(Cl)cc1
|
| 70 |
+
NC(=O)[C@@H]1C[C@H](NC(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-])CN1
|
| 71 |
+
CC(C)C[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)C[C@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1ccccc1)C(N)=O
|
| 72 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCNC(=N)N)NC(=O)CC(c1ccccc1)c1ccccc1)[C@@H](C)O)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1cn(CCN2CCCc3cc(/C=C/C4=C(Br)C(/C=C/c5cc6c7c(c5)CCCN7CCC6)=[O+][B-](F)(C(F)(F)F)O4)ccc32)nn1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O)[C@@H](C)CC)[C@@H](C)CC
|
| 73 |
+
Cc1ncsc1-c1ccc(C2(NC(=O)[C@@H]3C[C@@H](O)CN3C(=O)[C@@H](NC(=O)[C@H]3CC4(C3)C[C@H](N3CCC(c5cnc(N6C7CCC6CN(c6cc(-c8ccccc8O)nnc6N)C7)nc5)CC3)C4)C(C)(C)C)CC2)cc1
|
| 74 |
+
Cc1cc(C)c(CNC(=O)c2cc(-c3ccc(N4CCN(C(=O)CCCCCn5cc(CCCCCC(=O)N[C@@H](C(=O)N6C[C@H](O)C[C@H]6C(=O)NCc6ccc(-c7scnc7C)cc6)C(C)(C)C)nn5)CC4)nc3)cc3c2cnn3C(C)C)c(=O)[nH]1
|
| 75 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(C)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC
|
| 76 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@H](CNC(=O)c1cccc(S(=O)(=O)F)c1)NC(C)=O)[C@@H](C)CC)C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
|
| 77 |
+
CC(C)[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](Cc1cccc(Cl)c1)NC(=O)C1CCN(C)CC1)C(=O)C(F)(F)F
|
| 78 |
+
COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2ccc(CNC(=O)OC(C)(C)C)c(Cl)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
|
| 79 |
+
COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2cccc(OCC(=O)OC(C)(C)C)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
|
| 80 |
+
COc1ccc([C@H](NC(=O)[C@H](C)NC(=O)C(c2ccc(Cl)cc2)C(C)C)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
|
| 81 |
+
CC(C)C[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCCCN)NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CCCN1C(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](Cc1c[nH]cn1)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)CC(C)C)C(=O)O
|
| 82 |
+
CSCC[C@H](NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCCCN)C(C)C)[C@@H](C)O)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CS)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](CC(C)C)C(=O)O
|
| 83 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](N)CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
|
| 84 |
+
Cc1oc2c(c(C)cc3oc(=O)c(CC(=O)NCC(=O)NCC(=O)NCC(C)O)c(C)c32)c1C
|
| 85 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(N)=O)NC(C)=O)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O
|
| 86 |
+
Cc1cc(Cn2c(N3CC4(CNC4)C3)nc3c(N4CCN(CCCC(=O)NCCCC(=O)N[C@H](C(=O)N5C[C@H](O)C[C@H]5C(=O)N[C@@H](C)c5ccc(-c6scnc6C)cc5)C(C)(C)C)CC4)cc(Cl)cc32)cc(C)c1F
|
| 87 |
+
Cn1ccc(-c2cc(Cl)c(Cl)c3[nH]c4c(c23)CN(C(=O)CNC(=O)CN2CCNCC2)CC4)n1
|
| 88 |
+
CN(C)CCCCCNC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)Cc1c[nH]cn1
|
| 89 |
+
CN(Cc1ccc2ccccc2c1)C(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]1CCCN1/C(S)=N/Cc1ccccc1Cl
|
| 90 |
+
CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)CNC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CCCN=C(N)N)C(=O)O
|
| 91 |
+
CSCC[C@H](NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CO)NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@H](CCCN=C(N)N)C(=O)O
|
| 92 |
+
O=C(N[C@H](CC1CCCCC1)C(=O)N1C[C@H](N2CCCCC2)C[C@H]1C(=O)N[C@@H](CCCCN1CC2(CSC2)C1)B(O)O)c1ccc2ccccc2c1
|
| 93 |
+
N=C(N)NCCCC[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](Cc1ccccc1)NS(=O)(=O)Cc1ccccc1)C(=O)Cc1ccccc1
|
| 94 |
+
COC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CCCCC(=O)NC[C@@H]1CCN2CC[C@@H](CO[Si](c3ccccc3)(c3ccccc3)C(C)(C)C)N=C2N1)[C@@H](C)O
|
| 95 |
+
C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
|
| 96 |
+
COc1cc(N2CCN(CCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)N[C@@H](C)c3ccc(-c4scnc4C)cc3)C(C)(C)C)CC2)ccc1Nc1ncc(Cl)c(Nc2ccccc2P(C)(C)=O)n1
|
| 97 |
+
C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NNS(=O)(=O)c1ccccc1
|
| 98 |
+
CC[C@H](C)[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CO)NC(=O)CCNC(=S)Nc1ccc(-c2c3ccc(=O)cc-3oc3cc(O)ccc23)c(C(=O)O)c1)C(=O)N[C@@H](CSCc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N1CCC[C@H]1C(N)=O)[C@@H](C)O)C(C)C
|
| 99 |
+
CSCC[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](C)NC(=O)[C@H](CO)NC(=O)[C@H](C)N)[C@@H](C)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CS)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CS)C(=O)NCC(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)O)C(C)C)C(C)C
|
| 100 |
+
CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NP(=O)(O)CCCCN1C(=O)c2ccccc2C1=O)C(=O)NCc1ccccc1
|
| 101 |
+
CC(C)C[C@H](NCC(N)=O)c1cc(F)ccc1N1CCN(C(=O)[C@@H](Cc2ccc(Cl)cc2Cl)N2CCCC2=O)CC1
|
demo/example_smiles_IDs.txt
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ID SMILES
|
| 2 |
+
CHEMBL3782097 CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NO)NC(=O)OCc1ccccc1)C(C)C)[C@@H](O)CC(=O)NCCc1ccccc1
|
| 3 |
+
CHEMBL3819704 N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1cc(I)c(O)c(I)c1)C(N)=O
|
| 4 |
+
CHEMBL2368819 NC(=O)[C@@H]1C[C@H](NC(=O)C(F)(F)F)CN1C(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-]
|
| 5 |
+
CHEMBL3545807 CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCCNC(=N)N)NC(=O)OCc1ccccc1)[C@@H](O)CC(=O)NC1CCCCC1
|
| 6 |
+
CHEMBL3302347 CC(C)C[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)Cc1ccccc1)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
|
| 7 |
+
CHEMBL1184757 C[C@H](NC(=O)[C@@H](CO)NS(=O)(=O)c1ccccc1)C(=O)N[C@H]1CCCN(C(=N)N)C1O
|
| 8 |
+
CHEMBL2403897 C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](c2csc(-c3ccccc3)n2)CN1C(=O)[C@@H](NC(=O)OC1CCCC1)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
|
| 9 |
+
CHEMBL1229044 CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCC[N+](C)(C)C)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 10 |
+
CHEMBL2425403 CSCC[C@H](NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](C)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)CC(C)C)[C@@H](C)O)C(=O)O
|
| 11 |
+
CHEMBL2425396 CSCC[C@H](NC(=O)[C@H](Cc1cnc[nH]1)NC(=O)[C@H](CO)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)Cc1cnc[nH]1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)O
|
| 12 |
+
CHEMBL1181891 CC(C)(C)NC(=O)C1(C2CCCCC2)CCN(C(=O)[C@@H](Cc2ccc(F)cc2)NC(=O)[C@@H]2CNC3(CC3)CN2)CC1
|
| 13 |
+
CHEMBL1185696 CC(=O)O[C@H]1C(=O)[C@@]2(C)[C@H]([C@H](OC(=O)c3ccccc3)[C@]3(O)CC(OC(=O)[C@H](OC(=O)NCCNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc4ccccc4)NC(=O)[C@@H](C)N)C(NC(=O)c4ccccc4)c4ccccc4)C(C)=C1C3(C)C)[C@]1(OC(C)=O)CO[C@@H]1C[C@@H]2O
|
| 14 |
+
CHEMBL1189783 COC(=O)[C@@H](Cc1ccccc1)NC(=O)[C@H](NC(=O)CC(O)CC(CC(C)C)NC(=O)[C@@H](N)Cc1c[nH]cn1)C(C)C
|
| 15 |
+
CHEMBL1229047 CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C(C)C)C(O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCC[N+](C)(C)C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
|
| 16 |
+
CHEMBL418285 CC1OC(SCCCCCCNC(=O)[C@H](CCC(N)=O)NC(=O)[C@@H](C)N)C(O)C(O)C1O
|
| 17 |
+
CHEMBL3787168 CC1(C)N([O])C(c2ccc(OCC(=O)NCCCNC(=O)[C@H](Cc3ccc(O)cc3)NC(=O)[C@@H](N)CCCNC(=N)N)cc2)=[N+]([O-])C1(C)C
|
| 18 |
+
CHEMBL4302812 CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CSCCC[P+](C)(C)C)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O
|
| 19 |
+
CHEMBL1195733 CCN(CC)CCC(=O)NC(C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1)C(C)O
|
| 20 |
+
CHEMBL1179530 N[C@@H](Cc1cc2ccccc2[nH]1)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)NCCCC(=O)NCC(=O)NCCCCCCOP(=O)(O)Oc1ccccc1Cl
|
| 21 |
+
CHEMBL5029048 C[C@@H](N)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)NCCCCC(=O)OCCNc1nc(NCCc2ccccc2)c2cnn(/C=C/c3ccccc3)c2n1
|
| 22 |
+
CHEMBL1199463 CCCC[PH](CCCC)(CCCC)Cc1ccc(NC(=O)C2Cc3ccccc3CN2C(=O)[C@@H](N)CCc2ccccc2)cc1
|
| 23 |
+
CHEMBL2103901 CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CCCCN)C(=O)N[C@@H](CCCCN)C(=O)O
|
| 24 |
+
CHEMBL2165218 Cc1cc(C(=O)N[C@H](C(=O)N[C@@H](Cc2ccc(F)cc2)C(=O)N[C@@H](/C=C/C(=O)OCc2nc3cc(Cl)ccc3[nH]2)CCC(N)=O)C(C)C)no1
|
| 25 |
+
CHEMBL4300381 CCN(CC)c1ccc2c(-c3ccc(S(=O)(=O)NCCCC[C@H](NC(=O)Cc4csc(=N)n4C)C(=O)N[C@@H](Cc4cn(Cc5ccccc5)c[n+]4C)C(=O)NC4CCN(C)CC4)cc3S(=O)(=O)[O-])c3ccc(=[N+](CC)CC)cc-3oc2c1
|
| 26 |
+
CHEMBL3302723 CC(C)C[C@H](N)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
|
| 27 |
+
CHEMBL4301870 CC1=CC(C)=[N+]2C1=Cc1ccc(CCC(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O)n1[B-]2(F)F
|
| 28 |
+
CHEMBL2304033 N=C(N)NCCC[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](CC(=O)O)NC(=O)[C@@H](CO)NC(=O)[C@@H](N)CC(=O)O)C(=O)O
|
| 29 |
+
CHEMBL2364835 CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@H]2CCCN2C[C@H]1C(=O)N[C@@H]1CCOc2ccccc21)C1CCC(F)(F)CC1
|
| 30 |
+
CHEMBL1852804 COc1ccc(NC(=O)[C@@H]2Cc3ccc(OCC(=O)NO)cc3CN2C(=O)[C@H](C)N)cc1
|
| 31 |
+
CHEMBL3740745 COc1cccc(COc2ccc([C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCNC(=N)N)NC(=O)c3cccs3)C(N)=O)cc2)c1
|
| 32 |
+
CHEMBL5315308 CC(C)CC(N)C(=O)NCC(=O)NC[C@H](C)B1OC2CC3CC(C3(C)C)[C@@]2(C)O1
|
| 33 |
+
CHEMBL1188598 CCN(CC)CCNC(=O)c1ccc(C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C=O)CC(C)C)c(NNN2CCCC2)c1
|
| 34 |
+
CHEMBL1189941 CC(O)C(NC(=O)CCN)C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1
|
| 35 |
+
CHEMBL1191337 CCCN(CC(=O)N[C@H](C=O)CCCN=C(N)N)C(=O)[C@H]1CCCCN1
|
| 36 |
+
CHEMBL3407793 CC(C)C[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(C)C)C(C)C)C(=O)N[C@@H](CO)C(=O)O
|
| 37 |
+
CHEMBL1193469 CC(C)(N)C(=O)N[C@H](CCCc1ccccc1)C(=O)N1CCC2(CC1)CC(O)c1ccccc12
|
| 38 |
+
CHEMBL3408302 CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](N)CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CSc1ccc2n1[B-](F)(F)[N+]1=CC=CC1=C2)C(=O)O
|
| 39 |
+
CHEMBL4597997 O=C(NCCCCC[C@H](NC(=O)[C@@H]1C[C@@H](N2CCCCC2)CN1C(=O)[C@@H](CC1CCCCC1)NC(=O)c1ccc2ccccc2c1)B(O)O)NC1CCCCC1
|
| 40 |
+
CHEMBL3410386 CC(=O)N[C@@H](CCCO/N=C/c1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCC(=O)O)C(N)=O
|
| 41 |
+
CHEMBL1196235 COc1ccc(CC(C)(NC(=O)[C@@H]2CCCN2C(=O)CCCc2ccc(O)cc2)C(=O)NCCCN)cc1OC
|
| 42 |
+
CHEMBL1181305 N=C(N)N1CCC(C(NS(=O)(=O)Cc2ccccc2)C(=O)NCC(=O)N[C@H]2CCCN(C(=N)N)C2O)CC1
|
| 43 |
+
CHEMBL3787701 CCC(=O)NCCOCCOCCNC(=O)/N=C(\N)NCCC[C@H](NC(=O)[C@@H](N)CC(=O)O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1cnc[nH]1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC)C(C)C
|
| 44 |
+
CHEMBL1198214 N=C(N)c1ccc(CNC(=O)[C@@H]2CCCN2C(=O)[C@H](N)C2CCCCC2)cc1
|
| 45 |
+
CHEMBL3304520 CC(C)C[C@H](NC(=O)CNC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)NCC(N)=O
|
| 46 |
+
CHEMBL1179088 CCCCCCCC(=O)OC[C@H](NC(=O)C(C)(C)N)C(=O)N1CCC2(CC1)CN(S(C)(=O)=O)c1ccccc12
|
| 47 |
+
CHEMBL3794663 COc1ccc(NC(C)=O)cc1C(=O)NNC(=O)[C@H](CCCCN)NC(=O)CCOC[C@H]1OC(OCCCNC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)OC(C)(C)C)C(C)C)[C@H](O)[C@@H](O)[C@@H]1O
|
| 48 |
+
CHEMBL1852000 CSCC[C@H](NC(=O)[C@@H]1Cc2ccccc2CN1)C(=O)NO
|
| 49 |
+
CHEMBL1207289 CSCC[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(C)=O)C(=O)NCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](COS(=O)(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
|
| 50 |
+
CHEMBL525036 C=Cc1c(C)c2cc3nc(c4c5[nH]c(cc6nc(cc1[nH]2)C(C)=C6CC)c(C)c5C(=O)C4)[C@@H](CCC(=O)N[C@H](C(=O)N[C@@H](CO)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCCCNOC(=O)CCCN(C)c1ccc(/N=N/c2cccc4nc5ccc(N(CC)CC)cc5[n+](-c5ccccc5)c24)cc1)C(N)=O)[C@@H](C)O)[C@@H]3C
|
| 51 |
+
CHEMBL2361923 CC(=O)NC(Cc1ccc([N+](=O)[O-])cc1)C(=O)NCC(N)C(=O)c1ccccc1
|
| 52 |
+
CHEMBL3354497 CC(C)(C)[C@H](NC(=O)Cc1cc(Cl)cc(Cl)c1)C(=O)NCC(=O)NC/C=C/S(C)(=O)=O
|
| 53 |
+
CHEMBL1199003 CC(C)CCOc1ccc2ccccc2c1-c1c(OCC(=O)N[C@H](CCCCN)C(=O)N[C@H](CCCN)C(=O)N[C@@H](CC(C)C)C(=O)OCc2ccccc2)ccc2ccccc12
|
| 54 |
+
CHEMBL1183069 CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2ccc(OCCCCCN)cc2c1CCCCCCN)C(=O)N[C@@H](CC(C)C)C(=O)O
|
| 55 |
+
CHEMBL3946803 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C)C(C)C
|
| 56 |
+
CHEMBL3985737 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 57 |
+
CHEMBL3984334 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 58 |
+
CHEMBL284201 CC[C@H](C)[C@H](NC(=O)[C@@H](N)C(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](C)C(=O)NCC(=O)O
|
| 59 |
+
CHEMBL3890815 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
|
| 60 |
+
CHEMBL3944455 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 61 |
+
CHEMBL3890020 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 62 |
+
CHEMBL3891294 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 63 |
+
CHEMBL2219891 COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)(C(=O)CO)C[C@@H]3O[C@H]1C[C@H]2[C@H](OCN2C(=O)OCc2ccc(NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc3ccccc3)NC(=O)[C@@H](C)N)cc2)[C@H](C)O1
|
| 64 |
+
CHEMBL3983321 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)CC(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
|
| 65 |
+
CHEMBL3914919 CC(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O)C(C)C)C(C)C
|
| 66 |
+
CHEMBL1178333 CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2c(OCCCCCN=C(N)N)cccc2c1CCCCCCN=C(N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
|
| 67 |
+
CHEMBL1463226 CCN(CC(=O)NCC(=O)Nc1cccc(C)c1C)Cc1ccccc1
|
| 68 |
+
CHEMBL5085501 CCn1c(-c2nonc2N)nc2c(C#CC(C)(C)O)ncc(OCCCNCCCC(=O)NCCOCCOCCOCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)NCc3ccc(-c4scnc4C)cc3)C(C)(C)C)c21
|
| 69 |
+
CHEMBL5286315 Cc1cc2c(s1)-n1c(C)nnc1[C@H](CC(=O)NCCOCCCOCC(=O)N[C@@H](C(=O)N1C[C@H](O)C[C@H]1C(=O)NCc1ccc(-c3scnc3C)cc1)C(C)(C)C)N=C2c1ccc(Cl)cc1
|
| 70 |
+
CHEMBL2368817 NC(=O)[C@@H]1C[C@H](NC(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-])CN1
|
| 71 |
+
CHEMBL5071325 CC(C)C[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)C[C@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1ccccc1)C(N)=O
|
| 72 |
+
CHEMBL5285634 CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCNC(=N)N)NC(=O)CC(c1ccccc1)c1ccccc1)[C@@H](C)O)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1cn(CCN2CCCc3cc(/C=C/C4=C(Br)C(/C=C/c5cc6c7c(c5)CCCN7CCC6)=[O+][B-](F)(C(F)(F)F)O4)ccc32)nn1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O)[C@@H](C)CC)[C@@H](C)CC
|
| 73 |
+
CHEMBL5185804 Cc1ncsc1-c1ccc(C2(NC(=O)[C@@H]3C[C@@H](O)CN3C(=O)[C@@H](NC(=O)[C@H]3CC4(C3)C[C@H](N3CCC(c5cnc(N6C7CCC6CN(c6cc(-c8ccccc8O)nnc6N)C7)nc5)CC3)C4)C(C)(C)C)CC2)cc1
|
| 74 |
+
CHEMBL5202298 Cc1cc(C)c(CNC(=O)c2cc(-c3ccc(N4CCN(C(=O)CCCCCn5cc(CCCCCC(=O)N[C@@H](C(=O)N6C[C@H](O)C[C@H]6C(=O)NCc6ccc(-c7scnc7C)cc6)C(C)(C)C)nn5)CC4)nc3)cc3c2cnn3C(C)C)c(=O)[nH]1
|
| 75 |
+
CHEMBL5090130 CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(C)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC
|
| 76 |
+
CHEMBL5091695 CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@H](CNC(=O)c1cccc(S(=O)(=O)F)c1)NC(C)=O)[C@@H](C)CC)C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
|
| 77 |
+
CHEMBL5090609 CC(C)[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](Cc1cccc(Cl)c1)NC(=O)C1CCN(C)CC1)C(=O)C(F)(F)F
|
| 78 |
+
CHEMBL5073350 COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2ccc(CNC(=O)OC(C)(C)C)c(Cl)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
|
| 79 |
+
CHEMBL5089534 COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2cccc(OCC(=O)OC(C)(C)C)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
|
| 80 |
+
CHEMBL5088663 COc1ccc([C@H](NC(=O)[C@H](C)NC(=O)C(c2ccc(Cl)cc2)C(C)C)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
|
| 81 |
+
CHEMBL5195766 CC(C)C[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCCCN)NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CCCN1C(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](Cc1c[nH]cn1)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)CC(C)C)C(=O)O
|
| 82 |
+
CHEMBL5077064 CSCC[C@H](NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCCCN)C(C)C)[C@@H](C)O)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CS)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](CC(C)C)C(=O)O
|
| 83 |
+
CHEMBL1766929 CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](N)CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
|
| 84 |
+
CHEMBL1564198 Cc1oc2c(c(C)cc3oc(=O)c(CC(=O)NCC(=O)NCC(=O)NCC(C)O)c(C)c32)c1C
|
| 85 |
+
CHEMBL5198087 CC[C@H](C)[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(N)=O)NC(C)=O)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O
|
| 86 |
+
CHEMBL5208458 Cc1cc(Cn2c(N3CC4(CNC4)C3)nc3c(N4CCN(CCCC(=O)NCCCC(=O)N[C@H](C(=O)N5C[C@H](O)C[C@H]5C(=O)N[C@@H](C)c5ccc(-c6scnc6C)cc5)C(C)(C)C)CC4)cc(Cl)cc32)cc(C)c1F
|
| 87 |
+
CHEMBL5075875 Cn1ccc(-c2cc(Cl)c(Cl)c3[nH]c4c(c23)CN(C(=O)CNC(=O)CN2CCNCC2)CC4)n1
|
| 88 |
+
CHEMBL1767019 CN(C)CCCCCNC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)Cc1c[nH]cn1
|
| 89 |
+
CHEMBL323044 CN(Cc1ccc2ccccc2c1)C(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]1CCCN1/C(S)=N/Cc1ccccc1Cl
|
| 90 |
+
CHEMBL63188 CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)CNC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CCCN=C(N)N)C(=O)O
|
| 91 |
+
CHEMBL3138731 CSCC[C@H](NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CO)NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@H](CCCN=C(N)N)C(=O)O
|
| 92 |
+
CHEMBL4596927 O=C(N[C@H](CC1CCCCC1)C(=O)N1C[C@H](N2CCCCC2)C[C@H]1C(=O)N[C@@H](CCCCN1CC2(CSC2)C1)B(O)O)c1ccc2ccccc2c1
|
| 93 |
+
CHEMBL1797525 N=C(N)NCCCC[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](Cc1ccccc1)NS(=O)(=O)Cc1ccccc1)C(=O)Cc1ccccc1
|
| 94 |
+
CHEMBL2068547 COC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CCCCC(=O)NC[C@@H]1CCN2CC[C@@H](CO[Si](c3ccccc3)(c3ccccc3)C(C)(C)C)N=C2N1)[C@@H](C)O
|
| 95 |
+
CHEMBL414993 C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
|
| 96 |
+
CHEMBL5078877 COc1cc(N2CCN(CCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)N[C@@H](C)c3ccc(-c4scnc4C)cc3)C(C)(C)C)CC2)ccc1Nc1ncc(Cl)c(Nc2ccccc2P(C)(C)=O)n1
|
| 97 |
+
CHEMBL414992 C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NNS(=O)(=O)c1ccccc1
|
| 98 |
+
CHEMBL5281856 CC[C@H](C)[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CO)NC(=O)CCNC(=S)Nc1ccc(-c2c3ccc(=O)cc-3oc3cc(O)ccc23)c(C(=O)O)c1)C(=O)N[C@@H](CSCc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N1CCC[C@H]1C(N)=O)[C@@H](C)O)C(C)C
|
| 99 |
+
CHEMBL5094988 CSCC[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](C)NC(=O)[C@H](CO)NC(=O)[C@H](C)N)[C@@H](C)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CS)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CS)C(=O)NCC(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)O)C(C)C)C(C)C
|
| 100 |
+
CHEMBL419395 CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NP(=O)(O)CCCCN1C(=O)c2ccccc2C1=O)C(=O)NCc1ccccc1
|
| 101 |
+
CHEMBL393789 CC(C)C[C@H](NCC(N)=O)c1cc(F)ccc1N1CCN(C(=O)[C@@H](Cc2ccc(Cl)cc2Cl)N2CCCC2=O)CC1
|
dictionary.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas==2.2.2
|
| 2 |
+
rdkit-pypi==2022.9.5
|
| 3 |
+
tqdm==4.67.1
|
| 4 |
+
argparse==1.4.0
|
| 5 |
+
matplotlib==3.8.0
|
| 6 |
+
gradio>=5.0.0
|
| 7 |
+
cairosvg
|
run_pipeline.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
import datetime
|
| 6 |
+
|
| 7 |
+
def run_pipeline(input_file, output_dir, monomerizer_args=None):
    """Run the monomerizer, standardizer and GPepT data-preparation scripts in order.

    Each stage is launched as a child process with the current interpreter.
    The script paths are relative ("src/..."), so they are resolved against
    the current working directory.

    Args:
        input_file: Path handed to ``src/monomerizer.py`` via ``--input_file``.
        output_dir: Directory shared by all stages; created if missing.
        monomerizer_args: Optional extra command-line tokens appended to the
            monomerizer invocation only.

    Raises:
        subprocess.CalledProcessError: If any stage exits with a non-zero
            status (``check=True``); later stages are then not started.
    """
    os.makedirs(output_dir, exist_ok=True)

    interpreter = sys.executable
    output_flags = ["--output_dir", output_dir]

    # Stage 1: monomerizer, the only stage that takes the input file and extras.
    print(f"Running monomerizer.py... Input: {input_file}, Output: {output_dir}")
    subprocess.run(
        [
            interpreter,
            "src/monomerizer.py",
            "--input_file",
            input_file,
            *output_flags,
            *(monomerizer_args or []),
        ],
        check=True,
    )

    # Stages 2 and 3 only need the shared output directory.
    followups = (
        ("Running standardizer.py...", "src/standardizer.py"),
        ("Running prepare_GPepT_data.py...", "src/prepare_GPepT_data.py"),
    )
    for banner, script in followups:
        print(banner)
        subprocess.run([interpreter, script, *output_flags], check=True)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
if __name__ == "__main__":
    # Command-line entry point: parse the options, translate the optional ones
    # into extra monomerizer.py arguments, then run the three-stage pipeline.
    parser = argparse.ArgumentParser(description="Run a pipeline of programs sequentially.")

    # Add arguments
    parser.add_argument("--input_file", default="demo/example_smiles.txt", help="Input file for the pipeline")
    parser.add_argument("--output_dir", default=f"output/{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}", help="Output directory")
    parser.add_argument("--process_cyclic", action="store_true", help="Process cyclic compounds")
    parser.add_argument("--min_amino_acids", type=int, help="Minimum number of amino acids required")
    parser.add_argument("--batch_size", type=int, help="Batch size for processing")
    parser.add_argument("--max_workers", type=int, help="Maximum number of workers for parallel processing")
    parser.add_argument("-draw", action="store_true", help="Draw the molecules")

    args = parser.parse_args()

    # Prepare extra arguments for monomerizer.py.
    # Every token must be a string: subprocess rejects int items in argv.
    monomerizer_args = []
    if args.process_cyclic:
        monomerizer_args.append("-process_cyclic")
    if args.min_amino_acids:
        # Previously appended as int(...), which made subprocess.run raise TypeError.
        monomerizer_args.extend(["--min_amino_acids", str(args.min_amino_acids)])
    if args.batch_size:
        monomerizer_args.extend(["--batch_size", str(args.batch_size)])
    if args.max_workers:
        monomerizer_args.extend(["--max_workers", str(args.max_workers)])
    if args.draw:
        monomerizer_args.append("-draw")

    # Run the pipeline
    run_pipeline(args.input_file, args.output_dir, monomerizer_args=monomerizer_args)
|
src/analyse.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from rdkit import Chem
|
| 5 |
+
from rdkit.Chem import rdMolDescriptors, DataStructs, Descriptors
|
| 6 |
+
import os, sys, requests, tqdm, re, argparse
|
| 7 |
+
from collections import defaultdict
|
| 8 |
+
import xml.etree.ElementTree as ET
|
| 9 |
+
|
| 10 |
+
def add_canonical_smiles(df):
    """Return ``df`` with one row appended for each of the 20 canonical amino acids.

    The appended rows carry the columns ``ID`` (one-letter code), ``SMILES``,
    ``CANONICAL`` (the string ``'True'``), ``TERMINAL`` (``'NotTer'``) and
    ``ROMol`` (the parsed RDKit molecule). ``df`` itself is not modified.

    Args:
        df: DataFrame to extend.

    Returns:
        A new DataFrame: ``df`` followed by the 20 canonical rows, re-indexed.
    """
    # Code and structure are kept side by side so they cannot drift apart.
    # The previous parallel lists had Asn/Gln and Val/Ile structures swapped.
    canonical_amino_acids = [
        ('W', "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N"),  # Tryptophan
        ('R', "C(C[C@@H](C(=O)O)N)CNC(=N)N"),            # Arginine
        ('H', "C1=C(NC=N1)C[C@@H](C(=O)O)N"),            # Histidine
        ('P', "C1C[C@H](NC1)C(=O)O"),                    # Proline
        ('K', "C(CCN)C[C@@H](C(=O)O)N"),                 # Lysine
        ('M', "CSCC[C@@H](C(=O)O)N"),                    # Methionine
        ('N', "C([C@@H](C(=O)O)N)C(=O)N"),               # Asparagine
        ('Q', "C(CC(=O)N)[C@@H](C(=O)O)N"),              # Glutamine
        ('E', "C(CC(=O)O)[C@@H](C(=O)O)N"),              # Glutamic acid
        ('D', "OC(=O)C[C@@H](C(=O)O)N"),                 # Aspartic acid
        ('Y', "C1=CC(=CC=C1C[C@@H](C(=O)O)N)O"),         # Tyrosine
        ('F', "C1=CC=C(C=C1)C[C@@H](C(=O)O)N"),          # Phenylalanine
        ('V', "CC(C)[C@@H](C(=O)O)N"),                   # Valine
        ('L', "CC(C)C[C@@H](C(=O)O)N"),                  # Leucine
        ('I', "CC[C@H](C)[C@@H](C(=O)O)N"),              # Isoleucine
        ('T', "C[C@H]([C@@H](C(=O)O)N)O"),               # Threonine
        ('C', "C([C@@H](C(=O)O)N)S"),                    # Cysteine
        ('S', "C([C@@H](C(=O)O)N)O"),                    # Serine
        ('A', "C[C@@H](C(=O)O)N"),                       # Alanine
        ('G', "C(C(=O)O)N"),                             # Glycine
    ]
    one_letter_codes = [code for code, _ in canonical_amino_acids]
    canonical_smiles_list = [smi for _, smi in canonical_amino_acids]

    canonical_df = pd.DataFrame({
        'ID': one_letter_codes,
        'SMILES': canonical_smiles_list,
        'CANONICAL': ['True'] * len(canonical_smiles_list),
        'TERMINAL': ['NotTer'] * len(canonical_smiles_list),
        'ROMol': [Chem.MolFromSmiles(smi) for smi in canonical_smiles_list]
    })

    return pd.concat([df, canonical_df], ignore_index=True)
|
| 44 |
+
|
| 45 |
+
def cal_tanimoto(mol):
    """Return the Tanimoto similarity between ``mol`` and glycine.

    Both molecules are encoded as radius-2 Morgan fingerprints.

    Args:
        mol: RDKit molecule, or ``None`` when the SMILES could not be parsed.

    Returns:
        The similarity as a float, or the string ``"NULL"`` when ``mol`` is
        ``None`` (the same placeholder ``fetch_rdkit_properties`` uses for
        unparsable input).
    """
    # Chem.MolFromSmiles yields None for invalid SMILES; fingerprinting None
    # would raise and abort the whole DataFrame.apply in main().
    if mol is None:
        return "NULL"
    l_glycine = Chem.MolFromSmiles("C(C(=O)O)N")
    fp1 = rdMolDescriptors.GetMorganFingerprint(mol, 2)
    fp2 = rdMolDescriptors.GetMorganFingerprint(l_glycine, 2)
    return DataStructs.TanimotoSimilarity(fp1, fp2)
|
| 50 |
+
|
| 51 |
+
def fetch_pubchem_name(smiles):
    """Look up the PubChem title (preferred name) for a SMILES string.

    Args:
        smiles: SMILES of the compound to look up.

    Returns:
        The ``Title`` property of the first PubChem hit, or the string
        ``"NULL"`` if the request fails, times out, or the response does not
        have the expected shape.
    """
    # The SMILES is sent as a query argument rather than spliced into the URL
    # path: characters such as '/', '\\', '#' and '+' are common in SMILES and
    # would otherwise corrupt the path. requests percent-encodes the value.
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/property/Title/JSON"
    try:
        # A timeout keeps one stalled lookup from hanging the whole analysis.
        response = requests.get(url, params={"smiles": smiles}, timeout=30)
        response.raise_for_status()
        data = response.json()
        return data['PropertyTable']['Properties'][0].get('Title', 'NULL')
    except (requests.exceptions.RequestException, KeyError, IndexError, TypeError, ValueError):
        # ValueError covers a body that is not valid JSON; TypeError covers a
        # JSON document that is not the expected nested mapping.
        return "NULL"
|
| 60 |
+
|
| 61 |
+
def fetch_chembl_similarity(smiles, similarity_threshold=100):
    """Query the ChEMBL similarity endpoint and return the matching ChEMBL IDs.

    Args:
        smiles: SMILES of the query compound.
        similarity_threshold: Threshold segment of the request URL
            (default 100).

    Returns:
        A list of ``molecule_chembl_id`` strings, or ``["NULL"]`` when there
        are no hits, the request fails or times out, or the response is not
        well-formed XML.
    """
    from urllib.parse import quote

    try:
        # Percent-encode the SMILES so '#', '+', '\\' and '/' cannot be read as
        # URL syntax. NOTE(review): some servers still treat an encoded '/'
        # as a path separator - confirm against the ChEMBL API if
        # stereo-bond SMILES return no hits.
        encoded_smiles = quote(str(smiles), safe="")
        url = f"https://www.ebi.ac.uk/chembl/api/data/similarity/{encoded_smiles}/{similarity_threshold}"
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        root = ET.fromstring(response.content)
        chembl_ids = []
        for molecule in root.findall('.//molecule'):
            id_node = molecule.find('.//molecule_chembl_id')
            # Skip empty ids: a None here would break ",".join in fetch_names.
            if id_node is not None and id_node.text:
                chembl_ids.append(id_node.text)
        return chembl_ids if chembl_ids else ["NULL"]
    except (requests.exceptions.RequestException, ET.ParseError):
        return ["NULL"]
|
| 71 |
+
|
| 72 |
+
def fetch_names(smiles):
    """Return ``(pubchem_title, chembl_ids)`` for a SMILES string.

    The PubChem lookup runs first, then the ChEMBL similarity search; the
    ChEMBL IDs are flattened into one comma-separated string.
    """
    return fetch_pubchem_name(smiles), ",".join(fetch_chembl_similarity(smiles))
|
| 76 |
+
|
| 77 |
+
def fetch_rdkit_properties(smiles):
    """Compute seven RDKit descriptors for a SMILES string.

    Returns:
        ``[exact mol weight, logP, TPSA, formal charge, rotatable bonds,
        H-bond donors, H-bond acceptors]``, or a list of seven ``"NULL"``
        strings when the SMILES cannot be parsed or any step raises.
    """
    placeholder = ["NULL"] * 7
    try:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            return placeholder
        return [
            Descriptors.ExactMolWt(molecule),
            Descriptors.MolLogP(molecule),
            Descriptors.TPSA(molecule),
            Chem.GetFormalCharge(molecule),
            Descriptors.NumRotatableBonds(molecule),
            Descriptors.NumHDonors(molecule),
            Descriptors.NumHAcceptors(molecule),
        ]
    except Exception:
        # Deliberate best-effort: one bad structure must not abort the batch.
        return placeholder
|
| 92 |
+
|
| 93 |
+
def count_monomers(mols_df):
    """Count how often each monomer token occurs across all sequences.

    A token is one capital letter followed by any run of non-capital
    characters. Entries of ``mols_df['SEQUENCE']`` that are not non-empty
    strings are ignored.

    Returns:
        A ``defaultdict(int)`` mapping token -> number of occurrences.
    """
    tally = defaultdict(int)
    monomer_token = re.compile('[A-Z][^A-Z]*')
    for entry in mols_df['SEQUENCE']:
        if not isinstance(entry, str) or not entry:
            continue
        for monomer in monomer_token.findall(entry):
            tally[monomer] += 1
    return tally
|
| 101 |
+
|
| 102 |
+
def main():
    """Annotate a tab-separated molecule table and write the result as CSV.

    Reads ``--mols_file``, drops rows with a missing or duplicated SMILES,
    optionally fetches PubChem/ChEMBL names, adds the Tanimoto similarity to
    glycine and seven RDKit descriptors, and writes the table to
    ``<input_dir>/<output_file>``.
    """
    parser = argparse.ArgumentParser(description='Analyse non-natural amino acids (NNAA) from PubChem.')
    parser.add_argument('--input_dir', help='Input directory containing the monomer data.', default='data/tmp')
    parser.add_argument('--mols_file', help='File name relative to input_dir.', default='standard/sequences_standardized.txt')
    parser.add_argument('-fetch_names', help='Fetch names from PubChem and ChEMBL.', action='store_true')
    parser.add_argument('--target_type', help='Type of target: ncAAs or peptides?', default='ncAAs')
    parser.add_argument('--output_file', help='Output CSV file name.', default='analysis.csv')
    args = parser.parse_args()

    # NOTE(review): the help text says mols_file is relative to input_dir, but
    # the path is used exactly as given. Left unchanged because callers may
    # already pass a full path - confirm before joining it with input_dir.
    mols_path = args.mols_file
    output_path = os.path.join(args.input_dir, args.output_file)

    df = pd.read_csv(mols_path, sep='\t')
    df = df.dropna(subset=['SMILES']).drop_duplicates(subset=['SMILES'])
    df['ROMol'] = df['SMILES'].apply(Chem.MolFromSmiles)

    if args.fetch_names:
        df[['PUBCHEM_NAME', 'CHEMBL_NAMES']] = df['SMILES'].apply(fetch_names).tolist()

    df['Tanimoto_to_Glycine'] = df['ROMol'].apply(cal_tanimoto)
    df[['MolWt', 'LogP', 'TPSA', 'FormalCharge', 'RotatableBonds', 'HydrogenDonors', 'HydrogenAcceptors']] = df['SMILES'].apply(fetch_rdkit_properties).tolist()

    # The output directory may not exist yet (mols_file is read from elsewhere).
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    # ROMol holds in-memory molecule objects; written to CSV they would only
    # appear as object reprs, so the helper column is left out of the file.
    df.drop(columns=['ROMol']).to_csv(output_path, index=False)
    print(f"Processing completed. Results saved to {output_path}")
|
| 126 |
+
|
| 127 |
+
# Script entry point: run the analysis only when this file is executed directly.
if __name__ == "__main__":
    main()
|
src/demonomerizer.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
import re, ast
|
| 4 |
+
from rdkit import Chem
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
import argparse
|
| 8 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
# Parse the input arguments
# NOTE: parsing happens at module level, so it runs as soon as this file is
# executed or imported.
parser = argparse.ArgumentParser(description="Preprocess the generated sequences file")
parser.add_argument("--sequence_file", type=str, help="Path to the generated sequences file", default="sequences_generated.txt")
parser.add_argument("--NNAA_file", type=str, help="Path to the NNAA file", default="dictionary.txt")
parser.add_argument("--batch_size", type=int, help="Batch size for processing sequences", default=8)
parser.add_argument("--output_dir", type=str, help="Output directory", default="output")
parser.add_argument("--demonomerized_file", type=str, help="Output demonomerized file name", default="demonomerized.txt")

args = parser.parse_args()

# SMARTS patterns used to locate the atoms that take part in peptide bonds.
# valid_backbone: amine N (one or two H) - C - carbonyl C(=O).
valid_backbone = Chem.MolFromSmarts("[NH,NH2]CC(=O)")
# Same backbone followed by one more oxygen on the carbonyl carbon; that
# oxygen is the atom removed when the dictionary is loaded below.
valid_backbone_OH = Chem.MolFromSmarts("[NH,NH2]CC(=O)O")
peptide_bond_mol = Chem.MolFromSmarts("[N,n][C,c]C(=O)[*!O]") # [*!O] ensures it does not match AAter
# Positions inside a substructure-match tuple for the patterns above:
# atom 0 is the amine N, atom 2 the carbonyl C, atom 4 the trailing O.
edge_C = 2
edge_N = 0
edge_O = 4
|
| 27 |
+
|
| 28 |
+
# Monomer name -> SMILES lookup for the 20 canonical amino acids.
# "<code>ter" entries keep the carboxylic-acid oxygen (C(=O)O) and are used for
# the last residue of a sequence; plain "<code>" entries lack it (C(=O)).
# Entries from the dictionary file are added to this table further below.
name_smi_dict = {
    # isomeric SMILES from pubchem. eg https://pubchem.ncbi.nlm.nih.gov/compound/Alanine except for Asp (from https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=3309) and Arg (from https://en.wikipedia.org/wiki/Arginine)
    "Wter": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N",
    "W": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O))N",
    "Rter": "C(C[C@@H](C(=O)O)N)CNC(=N)N",
    "R": "C(C[C@@H](C(=O))N)CNC(=N)N",
    "Hter": "C1=C(NC=N1)C[C@@H](C(=O)O)N",
    "H": "C1=C(NC=N1)C[C@@H](C(=O))N",
    "Pter": "C1C[C@H](NC1)C(=O)O",
    "P": "C1C[C@H](NC1)C(=O)",
    "Kter": "C(CCN)C[C@@H](C(=O)O)N",
    "K": "C(CCN)C[C@@H](C(=O))N",
    "Mter": "CSCC[C@@H](C(=O)O)N",
    "M": "CSCC[C@@H](C(=O))N",
    "Qter": "C(CC(=O)N)[C@@H](C(=O)O)N",
    "Q": "C(CC(=O)N)[C@@H](C(=O))N",
    "Nter": "C([C@@H](C(=O)O)N)C(=O)N",
    "N": "C([C@@H](C(=O))N)C(=O)N",
    "Eter": "C(CC(=O)O)[C@@H](C(=O)O)N",
    "E": "C(CC(=O)O)[C@@H](C(=O))N",
    "Dter": "OC(=O)C[C@@H](C(=O)O)N",
    "D": "OC(=O)C[C@@H](C(=O))N",
    "Yter": "C1=CC(=CC=C1C[C@@H](C(=O)O)N)O",
    "Y": "C1=CC(=CC=C1C[C@@H](C(=O))N)O",
    "Fter": "C1=CC=C(C=C1)C[C@@H](C(=O)O)N",
    "F": "C1=CC=C(C=C1)C[C@@H](C(=O))N",
    "Iter": "CC[C@H](C)[C@@H](C(=O)O)N", # TODO add correct hydroxyl oxygen for every AA terminal
    "I": "CC[C@H](C)[C@@H](C(=O))N",
    "Lter": "CC(C)C[C@@H](C(=O)O)N",
    "L": "CC(C)C[C@@H](C(=O))N",
    "Vter": "CC(C)[C@@H](C(=O)O)N",
    "V": "CC(C)[C@@H](C(=O))N",
    "Tter": "C[C@H]([C@@H](C(=O)O)N)O",
    "T": "C[C@H]([C@@H](C(=O))N)O",
    "Cter": "C([C@@H](C(=O)O)N)S",
    "C": "C([C@@H](C(=O))N)S",
    "Ster": "C([C@@H](C(=O)O)N)O",
    "S": "C([C@@H](C(=O))N)O",
    "Ater": "C[C@@H](C(=O)O)N",
    "A": "C[C@@H](C(=O))N",
    "Gter": "C(C(=O)O)N",
    "G": "C(C(=O))N",
}
|
| 71 |
+
|
| 72 |
+
def mark_edge(amino, pattern, edge_position):
    """Tag one atom of ``amino`` as a bonding site and return it.

    The atom is chosen as element ``edge_position`` of the first substructure
    match of ``pattern`` and receives the property ``atomNote = "edge"``.

    Raises:
        IndexError: If ``pattern`` does not match (the match tuple is empty).
    """
    match = amino.GetSubstructMatch(pattern)
    site_atom = amino.GetAtomWithIdx(match[edge_position])
    site_atom.SetProp("atomNote", "edge")
    return site_atom
|
| 78 |
+
|
| 79 |
+
def mark_edge_NNAA(NNAA, bond_sites):
    """Tag the listed atoms of a dictionary monomer as bonding sites.

    Args:
        NNAA: Molecule whose atoms are tagged in place.
        bond_sites: Iterable of atom indices (ints or numeric strings). Each
            indexed atom receives the property ``atomNote = "edge"``.

    Best-effort: if ``bond_sites`` cannot be iterated, or an index is invalid,
    "No bond sites" is printed and the function returns; atoms tagged before
    the failure stay tagged.
    """
    try:
        for site in bond_sites:
            NNAA.GetAtomWithIdx(int(site)).SetProp("atomNote", "edge")
    except Exception:
        # Narrowed from a bare `except:` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        print("No bond sites")
|
| 88 |
+
|
| 89 |
+
def mark_bond_site(mol, index, symbol):
    """Relabel every tagged atom of element ``symbol`` with ``str(index)``.

    An atom is affected when it already carries an ``atomNote`` property
    (whatever its value) and its element symbol equals ``symbol``.
    """
    label = str(index)
    tagged = (
        candidate
        for candidate in mol.GetAtoms()
        if candidate.HasProp("atomNote") and candidate.GetSymbol() == symbol
    )
    for candidate in tagged:
        candidate.SetProp("atomNote", label)
|
| 93 |
+
|
| 94 |
+
def clear_props(atom1, atom2):
    """Remove the ``atomNote`` property from both atoms, ``atom1`` first."""
    for bonded_atom in (atom1, atom2):
        bonded_atom.ClearProp("atomNote")
|
| 97 |
+
|
| 98 |
+
def get_amino_mol(amino_name, name_smi_dict, NNAA_file):
    """Build the molecule for one monomer with its bonding sites tagged.

    The SMILES is taken from ``name_smi_dict``. If the standard backbone
    pattern matches, its N and carbonyl C are tagged as edges. Otherwise the
    monomer is rebuilt from the ``"Bond sites"`` cell of its ``NNAA_file``
    row: a Python-literal list whose first element is a SMILES string and
    whose remaining elements are the atom indices to tag.

    Args:
        amino_name: Key into ``name_smi_dict`` (e.g. ``"A"``, ``"Ater"``).
        name_smi_dict: Mapping of monomer name -> SMILES.
        NNAA_file: DataFrame with at least ``"ID"`` and ``"Bond sites"``
            columns; only consulted when the backbone pattern does not match.

    Returns:
        The tagged molecule.

    Raises:
        KeyError: If ``amino_name`` is not in ``name_smi_dict``.
    """
    # Direct lookup replaces the former scan over every dictionary entry and
    # gives unknown monomers a clear error.
    if amino_name not in name_smi_dict:
        raise KeyError(f"Unknown monomer: {amino_name!r}")

    amino_mol = Chem.MolFromSmiles(name_smi_dict[amino_name])
    try:
        mark_edge(amino_mol, valid_backbone, edge_N)
        mark_edge(amino_mol, valid_backbone, edge_C)
    except Exception:
        # Narrowed from a bare `except:`. Reached when the backbone pattern
        # does not match (or the SMILES did not parse): fall back to the
        # explicit bond sites. Every matching row is applied, so the last
        # one wins, as before.
        for _, row in NNAA_file.iterrows():
            if row["ID"] == amino_name:
                bond_info = ast.literal_eval(row["Bond sites"])
                smiles_rootedAtAtom0 = bond_info[0]
                bond_sites = bond_info[1:]
                amino_mol = Chem.MolFromSmiles(smiles_rootedAtAtom0)
                mark_edge_NNAA(amino_mol, bond_sites)
    return amino_mol
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def process_batch(batch_df):
    """Run ``process_row`` on every row of ``batch_df``.

    Returns:
        A list of ``(row index, SMILES or None)`` tuples in row order.
    """
    outcomes = (process_row(label, record) for label, record in batch_df.iterrows())
    return [(row_index, row_smiles) for row_index, row_smiles in outcomes]
|
| 123 |
+
|
| 124 |
+
def process_row(index, row):
    """Assemble the peptide SMILES for one sequence row.

    The row's ``SEQUENCE`` is split into monomer tokens with the module-level
    ``regex``; each token is turned into a molecule, and consecutive monomers
    are joined by a single bond between tagged C and N atoms.

    Returns:
        ``(index, smiles)``. ``smiles`` is ``None`` when assembly raised or
        the result still contains disconnected fragments. If the row already
        holds a non-empty SMILES, that value is returned unchanged.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation had been lost - in particular the placement of the ``break``
    inside the peptide-bond check. Verify against the repository.
    """
    # Rebuild only when there is no usable SMILES: column missing, NaN
    # (a float), or an empty string.
    if "SMILES" not in row or type(row["SMILES"]) == float or len(row["SMILES"]) == 0:
        seq = row["SEQUENCE"]
        split_seq = regex.findall(seq)
        ordered_aminos = []

        try:
            for alphabet in split_seq:
                amino_mol = get_amino_mol(alphabet, name_smi_dict, NNAA_file)
                ordered_aminos.append(amino_mol)

            # Replace the last amino with the terminal amino
            # (an empty token list raises IndexError here, handled below).
            amino_ter = split_seq[-1]
            if not "ter" in amino_ter and not amino_ter.startswith("Z"):
                amino_ter = f"{amino_ter}ter"
            last_mol = get_amino_mol(amino_ter, name_smi_dict, NNAA_file)
            ordered_aminos[-1] = last_mol

            combined = ordered_aminos[0]
            for i in range(len(ordered_aminos)-1):
                # Label the growing chain's tagged C atoms with i and the
                # next monomer's tagged N atoms with i+1 so they can be paired.
                mark_bond_site(combined, i, "C")
                next_amino = ordered_aminos[i+1]
                mark_bond_site(next_amino, i+1, "N")
                combined = Chem.CombineMols(combined, next_amino)
                rwmol = Chem.RWMol(combined)
                for atom1 in rwmol.GetAtoms():
                    if atom1.HasProp("atomNote") and atom1.GetProp("atomNote") == f"{i}":
                        for atom2 in rwmol.GetAtoms():
                            if atom2.HasProp("atomNote") and atom2.GetProp("atomNote") == f"{i+1}":
                                rwmol.AddBond(atom1.GetIdx(), atom2.GetIdx(), Chem.BondType.SINGLE)
                                clear_props(atom1, atom2)
                                # Accept the bond only if the molecule now has
                                # exactly i+1 peptide-bond matches.
                                if len(rwmol.GetSubstructMatches(peptide_bond_mol)) == i+1:
                                    combined = rwmol.GetMol()
                                    break

            result = Chem.MolToSmiles(combined, isomericSmiles=True, rootedAtAtom=0, canonical=True)
            if '.' in result:
                return index, None  # Indicates unbound atoms
            return index, result

        except Exception as e:
            # Best-effort: a sequence that cannot be assembled is reported and skipped.
            print(f"Error in sequence: {seq}")
            return index, None

    return index, row.get("SMILES")  # Return the existing SMILES if present
|
| 169 |
+
|
| 170 |
+
# Load the monomer dictionary (tab-separated, with at least "ID" and "SMILES"
# columns) and register every entry in name_smi_dict.
# NOTE(review): the indentation of this loop was lost in the source this was
# recovered from. Whether the last two statements sit inside the
# `if NNAA.HasSubstructMatch(...)` branch is an assumption - verify against
# the repository before relying on this layout.
NNAA_file = pd.read_csv(args.NNAA_file, sep="\t")
for index, row in NNAA_file.iterrows():
    smiles = row["SMILES"]
    name = row["ID"]
    NNAA = Chem.MolFromSmiles(smiles)
    if NNAA.HasSubstructMatch(valid_backbone_OH):
        # Register the in-chain form: same structure with the oxygen at
        # match position edge_O removed.
        rwmol = Chem.RWMol(NNAA)
        OH_i = NNAA.GetSubstructMatch(valid_backbone_OH)[edge_O]
        rwmol.RemoveAtom(OH_i)
        noOH_smiles = Chem.MolToSmiles(rwmol)
        name_smi_dict[name] = noOH_smiles
    # The unmodified SMILES is stored under "<ID>ter", except for IDs starting
    # with "Z", which are stored under their own name.
    if not name.startswith("Z"):
        name = f"{name}ter"
    name_smi_dict[name] = smiles
|
| 184 |
+
|
| 185 |
+
# Sequence tokenizer: "X<digits>" or "Z<digits>" dictionary IDs, or a single
# capital letter from A-W or Y (bare X and Z are never tokens on their own).
tokenizer = r"X\d+|Z\d+|[A-WY]"
regex = re.compile(tokenizer)

df = pd.read_csv(args.sequence_file, sep="\t")

# add a column for SMILES
# (this also blanks any SMILES column already in the input, so every row is
# rebuilt from its sequence by process_row)
df["SMILES"] = ""

# Process in batches
batch_size = args.batch_size
batches = [df[i:i + batch_size] for i in range(0, df.shape[0], batch_size)]

# Process batches in parallel
with ThreadPoolExecutor() as executor:
    futures = {executor.submit(process_batch, batch): batch for batch in batches}
    for future in tqdm(as_completed(futures), total=len(futures)):
        results = future.result()
        for index, smiles in results:
            # Rows whose assembly failed (None) keep the empty string.
            if smiles:
                df.at[index, "SMILES"] = smiles

# Write the result into the requested output directory, creating it if needed.
os.makedirs(args.output_dir, exist_ok=True)
output_file = os.path.join(args.output_dir, args.demonomerized_file)

# Save the sequences together with their SMILES as a tab-separated file.
df.to_csv(output_file, sep="\t", index=False)
|
src/draw.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
from matplotlib.colors import ListedColormap
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
from rdkit.Chem.Draw import rdMolDraw2D
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
class MoleculeDrawer:
    """Write SVG depictions of labelled molecules plus a colour legend.

    Atoms are expected to carry an "AA" string property whose first three
    characters are a three-letter amino-acid code (for example "Ala:0").
    Atoms whose label starts with "Unk" or "X" are left un-highlighted.
    All files are written below ``<output_dir>/raw/images``.
    """

    def __init__(self, output_dir="output/tmp"):
        """Create the image directory and set up the residue colour table."""
        self.output_dir = os.path.join(output_dir, "raw/images")
        os.makedirs(self.output_dir, exist_ok=True)
        # RGB triples (0-1 range) keyed by three-letter amino-acid code.
        self.aa2color_dict = {
            "Asp": (0.902, 0.039, 0.039), "Glu": (0.961, 0.1, 0.537), "Arg": (0.078, 0.353, 1), "Lys": (0.42, 0.353, 1),
            "His": (0.51, 0.51, 0.824), "Tyr": (0.196, 0.196, 0.667), "Phe": (0.341, 0.196, 0.667), "Trp": (0.706, 0.353, 0.706),
            "Asn": (0, 0.863, 0.863), "Gln": (0.5, 0.82, 0.863), "Met": (0.902, 0.902, 0), "Cys": (0.722, 0.902, 0),
            "Ser": (0.98, 0.588, 0), "Thr": (0, 0.612, 0.412), "Gly": (0.98, 0.922, 0.922), "Ala": (0.784, 0.784, 0.639),
            "Val": (0.059, 0.51, 0.059), "Leu": (0.29, 0.51, 0.059), "Ile": (0.29, 0.51, 0.471), "Pro": (1, 0.588, 0.51)
        }
        # The legend only depends on aa2color_dict, so it is rendered once.
        self._colormap_written = False

    def sort_atom_highlights(self, mol):
        """Return ``{atom_idx: [rgb]}`` for every atom labelled as a standard amino acid.

        Atoms without an "AA" property, and labels whose three-letter prefix
        has no entry in the colour table, are skipped instead of raising.
        """
        atom_highlights = defaultdict(list)
        for atom_idx in range(mol.GetNumAtoms()):
            labelled_atom = mol.GetAtomWithIdx(atom_idx)
            if not labelled_atom.HasProp("AA"):
                continue
            AA_label = labelled_atom.GetProp("AA")
            if not self.label_belongs_to_AA(AA_label):
                continue
            color = self.aa2color_dict.get(AA_label[:3])
            if color is not None:
                atom_highlights[atom_idx].append(color)

        # Hand back a plain dict of lists
        return {k: list(v) for k, v in atom_highlights.items()}

    def create_colormap(self):
        """Save ``colormap.png``, a colour bar with one tick per amino acid."""
        legend_data = [(aa[:3], color) for aa, color in self.aa2color_dict.items() if aa != "Unk"]
        fig, ax = plt.subplots(figsize=(1, 1))
        cmap = ListedColormap([color for _, color in legend_data])
        cax = ax.matshow(np.arange(len(legend_data)).reshape(1, -1), cmap=cmap)
        cbar = fig.colorbar(cax, ticks=np.arange(len(legend_data)), aspect=5)
        cbar.set_ticklabels([label for label, _ in legend_data])
        cbar.ax.tick_params(labelsize=3)
        ax.axis("off")
        plt.savefig(os.path.join(self.output_dir, "colormap.png"), bbox_inches="tight", dpi=300)
        plt.close()

    def draw_input_mol(self, mol, mol_index, seq, bond_highlights):
        """Draw ``mol`` as ``mol_<mol_index>.svg`` and make sure the legend exists.

        ``bond_highlights`` may be falsy (no bond highlights) or a mapping
        whose values are iterables; values are copied into lists.
        """
        atom_highlights = self.sort_atom_highlights(mol)

        # Ensure bond_highlights is a dict of lists
        bond_highlights = {k: list(v) for k, v in bond_highlights.items()} if bond_highlights else {}

        mol_name = f"mol_{mol_index}"
        legend = f"{mol_name}\nseq: {seq}\n8< = peptide bond\nAA_NAME:SEEN_COUNT:SEQUENCE_POSITION\n"

        self.draw_mol(mol, atom_highlights, bond_highlights, legend, mol_name)
        # Rendering the matplotlib legend is loop-invariant; do it only once
        # per drawer instead of once per molecule.
        if not getattr(self, "_colormap_written", False):
            self.create_colormap()
            self._colormap_written = True

    def draw_mol(self, mol, atom_highlights, bond_highlights, legend, mol_name):
        """Render one 600x300 SVG with the given highlights and legend text."""
        view = rdMolDraw2D.MolDraw2DSVG(600, 300)
        view.drawOptions().useBWAtomPalette()
        view.DrawMoleculeWithHighlights(mol, legend, dict(atom_highlights), dict(bond_highlights), {}, {})
        view.FinishDrawing()
        with open(os.path.join(self.output_dir, f"{mol_name}.svg"), "w") as f:
            f.write(view.GetDrawingText())

    def label_belongs_to_AA(self, label):
        """Return True unless ``label`` starts with "Unk" or with "X"."""
        shorter_label = label[:3]
        return shorter_label != "Unk" and not label.startswith("X")
|
src/monomer_analyzer.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
src/monomerizer.py
ADDED
|
@@ -0,0 +1,882 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
# This script takes an isomeric SMILES file as input and outputs a seq (FASTA-like) file with the corresponding amino acid sequence.
|
| 4 |
+
# The script also outputs an isomeric SMILES file with each NNAA (non-natural amino acid) labeled as "X".
|
| 5 |
+
# Any compound connected to a valid backbone is considered an individual amino acid.
|
| 6 |
+
# The NNAAs that do not possess a valid backbone "[NH,NH2]CC(=O)O" required to continuously form peptide bonds, are considered as terminal modifications, and are named as "X0ter", "X1ter", etc.
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from rdkit import Chem
|
| 10 |
+
from rdkit.Chem import RegistrationHash
|
| 11 |
+
from rdkit.Chem.RegistrationHash import HashLayer
|
| 12 |
+
from collections import deque
|
| 13 |
+
import argparse
|
| 14 |
+
from tqdm import tqdm
|
| 15 |
+
import pandas as pd
|
| 16 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 17 |
+
import multiprocessing as mp
|
| 18 |
+
from draw import MoleculeDrawer
|
| 19 |
+
from collections import defaultdict
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def parse_arguments():
    """Parse the monomerizer command line and return the resulting namespace.

    Options are registered from a small table so that each flag, its default
    and its help text sit next to each other.
    """
    cli = argparse.ArgumentParser(description="Process SMILES files and generate amino acid sequences.")
    option_table = (
        ("--input_file", {"default": "demo/example_smiles.txt", "help": "Input SMILES file"}),
        ("-process_cyclic", {"action": "store_true", "help": "Process cyclic peptides"}),
        ("--min_amino_acids", {"type": int, "default": 3, "help": "Minimum number of amino acids"}),
        ("--batch_size", {"type": int, "default": 100, "help": "Batch size"}),
        ("--output_dir", {"default": "output/tmp", "help": "Output directory"}),
        ("--max_workers", {"type": int, "default": mp.cpu_count(), "help": "Maximum number of workers for parallel processing"}),
        ("-draw", {"action": "store_true", "help": "Draw molecules"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
| 32 |
+
|
| 33 |
+
# Reference SMILES for the 20 standard amino acids.
# Each residue appears twice: "<Name>Ter" keeps the carboxylic acid hydroxyl
# (C(=O)O), while "<Name>" is the same residue with that oxygen dropped (C(=O)).
# Arginine has a second pair ("Arg Ter" / "Arg2") written with the alternative
# guanidine form.
# NOTE(review): "Arg Ter" contains a space, unlike every other key; code in this
# file only inspects the first three characters of a label, so it still reads
# as "Arg" -- confirm the space is intentional.
name_smi_dict = {
    # isomeric SMILES from pubchem. eg https://pubchem.ncbi.nlm.nih.gov/compound/Alanine except for Asp (from https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=3309) and Arg (from https://en.wikipedia.org/wiki/Arginine)
    "TrpTer": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N",
    "Trp": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O))N",
    "ArgTer": "C(C[C@@H](C(=O)O)N)CNC(=N)N",
    "Arg Ter": "NC(N)=NCCC[C@H](N)C(=O)O",
    "Arg": "C(C[C@@H](C(=O))N)CNC(=N)N",
    "Arg2": "NC(N)=NCCC[C@H](N)C(=O)",
    "HisTer": "C1=C(NC=N1)C[C@@H](C(=O)O)N",
    "His": "C1=C(NC=N1)C[C@@H](C(=O))N",
    "ProTer": "C1C[C@H](NC1)C(=O)O",
    "Pro": "C1C[C@H](NC1)C(=O)",
    "LysTer": "C(CCN)C[C@@H](C(=O)O)N",
    "Lys": "C(CCN)C[C@@H](C(=O))N",
    "MetTer": "CSCC[C@@H](C(=O)O)N",
    "Met": "CSCC[C@@H](C(=O))N",
    "GlnTer": "C(CC(=O)N)[C@@H](C(=O)O)N",
    "Gln": "C(CC(=O)N)[C@@H](C(=O))N",
    "AsnTer": "C([C@@H](C(=O)O)N)C(=O)N",
    "Asn": "C([C@@H](C(=O))N)C(=O)N",
    "GluTer": "C(CC(=O)O)[C@@H](C(=O)O)N",
    "Glu": "C(CC(=O)O)[C@@H](C(=O))N",
    "AspTer": "OC(=O)C[C@@H](C(=O)O)N",
    "Asp": "OC(=O)C[C@@H](C(=O))N",
    "TyrTer": "C1=CC(=CC=C1C[C@@H](C(=O)O)N)O",
    "Tyr": "C1=CC(=CC=C1C[C@@H](C(=O))N)O",
    "PheTer": "C1=CC=C(C=C1)C[C@@H](C(=O)O)N",
    "Phe": "C1=CC=C(C=C1)C[C@@H](C(=O))N",
    "IleTer": "CC[C@H](C)[C@@H](C(=O)O)N", # TODO add correct hydroxyl oxygen for every AA terminal
    "Ile": "CC[C@H](C)[C@@H](C(=O))N",
    "LeuTer": "CC(C)C[C@@H](C(=O)O)N",
    "Leu": "CC(C)C[C@@H](C(=O))N",
    "ValTer": "CC(C)[C@@H](C(=O)O)N",
    "Val": "CC(C)[C@@H](C(=O))N",
    "ThrTer": "C[C@H]([C@@H](C(=O)O)N)O",
    "Thr": "C[C@H]([C@@H](C(=O))N)O",
    "CysTer": "C([C@@H](C(=O)O)N)S",
    "Cys": "C([C@@H](C(=O))N)S",
    "SerTer": "C([C@@H](C(=O)O)N)O",
    "Ser": "C([C@@H](C(=O))N)O",
    "AlaTer": "C[C@@H](C(=O)O)N",
    # FBR: I wonder if we should have a SMILES for AlaStart
    "Ala": "C[C@@H](C(=O))N",
    # Glycine has no stereocentre, so its alpha carbon is written without a chirality tag
    "GlyTer": "C(C(=O)O)N",
    "Gly": "C(C(=O))N",
}
|
| 80 |
+
|
| 81 |
+
# Parsed RDKit molecule for every entry of name_smi_dict, keyed by the same
# name. Despite the variable name, the keys are residue names, not SMILES.
smi2mol = {}
for aa_name, aa_smi in name_smi_dict.items():
    smi2mol[aa_name] = Chem.MolFromSmiles(aa_smi)

# SMARTS atom order: 0 = N, 1 = alpha C, 2 = carbonyl C, 3 = carbonyl O,
# 4 = the non-oxygen atom bonded to the carbonyl C.
peptide_bond_mol = Chem.MolFromSmarts("[N,n][C,c]C(=O)[*!O]") # [*!O] ensures it does not match AAter
# Positions inside a peptide_bond_mol match of the two atoms joined by the bond.
edge_C_position = 2
edge_N_position = 4
# Free amino acid backbone: amine N, alpha C, and a carboxylic acid with an explicit hydroxyl.
valid_backbone = Chem.MolFromSmarts("[NH,NH2]CC(=O)[OH]")
loose_backbone = Chem.MolFromSmarts("[C,c](C(=O)O)[N,n]") # Also detects backbone that contains a benzene ring. Used for removing -OH
# Position inside a loose_backbone match of the single-bonded oxygen (atom order: C, C, =O, O, N).
OH_position = 3
# Template oxygen atom (atomic number 8) added when a peptide bond is cut.
oxygen = Chem.Atom(8)
|
| 92 |
+
|
| 93 |
+
# Three-letter to one-letter amino-acid codes for the 20 standard residues.
# Looked up with the first three characters of an atom label (see write_seq).
three2one_letter = {
    "Ala": "A",
    "Gly": "G",
    "Ile": "I",
    "Leu": "L",
    "Pro": "P",
    "Val": "V",
    "Phe": "F",
    "Trp": "W",
    "Tyr": "Y",
    "Asp": "D",
    "Glu": "E",
    "Arg": "R",
    "His": "H",
    "Lys": "K",
    "Ser": "S",
    "Thr": "T",
    "Cys": "C",
    "Met": "M",
    "Asn": "N",
    "Gln": "Q",
}
|
| 115 |
+
|
| 116 |
+
# RGB colour (components in the 0-1 range) per three-letter amino-acid code.
# NOTE(review): the same table is defined in MoleculeDrawer (src/draw.py);
# this module-level copy looks redundant -- confirm whether it is still used.
aa2color_dict = {
    "Asp": (0.902, 0.039, 0.039),
    "Glu": (0.961, 0.1, 0.537),
    "Arg": (0.078, 0.353, 1),
    "Lys": (0.42, 0.353, 1),
    "His": (0.51, 0.51, 0.824),
    "Tyr": (0.196, 0.196, 0.667),
    "Phe": (0.341, 0.196, 0.667),
    "Trp": (0.706, 0.353, 0.706),
    "Asn": (0, 0.863, 0.863),
    "Gln": (0.5, 0.82, 0.863),
    "Met": (0.902, 0.902, 0),
    "Cys": (0.722, 0.902, 0),
    "Ser": (0.98, 0.588, 0),
    "Thr": (0, 0.612, 0.412),
    "Gly": (0.98, 0.922, 0.922),
    "Ala": (0.784, 0.784, 0.639),
    "Val": (0.059, 0.51, 0.059),
    "Leu": (0.29, 0.51, 0.059),
    "Ile": (0.29, 0.51, 0.471),
    "Pro": (1, 0.588, 0.51),
}
|
| 138 |
+
|
| 139 |
+
# no integer in the tuple was already matched
|
| 140 |
+
def tuple_fully_unmatched(indexes_group, already_matched, mol_a):
    """Tell whether every atom index in ``indexes_group`` is still unclaimed.

    An index is claimed when its atom in ``mol_a`` has an "AA" property that
    starts with "Unk", or when the index is contained in ``already_matched``.
    An empty group is reported as fully unmatched.
    """

    def is_claimed(atom_idx):
        atom = mol_a.GetAtomWithIdx(atom_idx)
        flagged_unknown = atom.HasProp("AA") and atom.GetProp("AA").startswith("Unk")
        return flagged_unknown or atom_idx in already_matched

    return not any(is_claimed(atom_idx) for atom_idx in indexes_group)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def match_AA(mol_b, dict):
    """Label atoms of ``mol_b`` that match a known amino-acid template.

    ``dict`` maps a residue name to an RDKit query molecule (the parameter
    shadows the builtin, but renaming it would change the interface).
    Templates are tried in the mapping's iteration order, with chirality
    respected. A match is accepted only if none of its atoms was claimed
    earlier; accepted atoms get the property "AA" = "<name>:<counter>".
    Mutates ``mol_b`` in place and returns None.
    """
    atoms_already_matched = set()
    for aa_name, aa_mol in dict.items():
        # Per-template counter used as the suffix of the label.
        i = 0
        for atom_indexes_group in mol_b.GetSubstructMatches(aa_mol, useChirality=True):
            prop = aa_name + ":" + str(i)
            if tuple_fully_unmatched(atom_indexes_group, atoms_already_matched, mol_b):
                for a_i in atom_indexes_group:
                    mol_b.GetAtomWithIdx(a_i).SetProp("AA", prop)
                    atoms_already_matched.add(a_i)
                # NOTE(review): indentation was lost in this copy of the file; the
                # counter is assumed to advance only for accepted matches -- confirm.
                i += 1
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def find_peptide_bonds(mol_c):
    """Return ``[C_idx, N_idx]`` for every ``peptide_bond_mol`` match in ``mol_c``.

    The two indices are the atoms at ``edge_C_position`` and
    ``edge_N_position`` of each substructure match.
    """
    return [
        [
            mol_c.GetAtomWithIdx(match[edge_C_position]).GetIdx(),
            mol_c.GetAtomWithIdx(match[edge_N_position]).GetIdx(),
        ]
        for match in mol_c.GetSubstructMatches(peptide_bond_mol)
    ]
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def set_peptide_bond_prop(mol, atom_indices_surrounding_peptide_bond):
    """Tag each listed peptide bond and its two atoms; return the bond indices.

    For every ``(C_idx, N_idx)`` pair the atoms receive "bond_site" = "C" /
    "N", and the bond between them receives "bondNote" = "8<" and
    "peptide_bond" = "peptide_bond".
    """
    tagged_bond_ids = []
    for carbon_idx, nitrogen_idx in atom_indices_surrounding_peptide_bond:
        for atom_idx, site in ((carbon_idx, "C"), (nitrogen_idx, "N")):
            mol.GetAtomWithIdx(atom_idx).SetProp("bond_site", site)
        bond = mol.GetBondBetweenAtoms(carbon_idx, nitrogen_idx)
        for key, value in (("bondNote", "8<"), ("peptide_bond", "peptide_bond")):
            bond.SetProp(key, value)
        tagged_bond_ids.append(bond.GetIdx())
    return tagged_bond_ids
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def label_peptide_bonds(mol_e):
    """Find the peptide bonds of ``mol_e``, tag them, and return their bond indices."""
    return set_peptide_bond_prop(mol_e, find_peptide_bonds(mol_e))
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def label_NNAAs(mol_e, peptide_bonds):
    """Give every still-unlabelled atom an "Unk<n>" label; return how many groups were made.

    Atoms are visited in index order. Each atom without an "AA" property
    seeds one call to ``label_unmatched_NNAA``, which labels the connected
    group it belongs to, so later atoms of the same group are already
    labelled when the loop reaches them.

    The loop covers all atoms (``GetNumAtoms``), like the other helpers in
    this module. The previous bound, ``GetNumHeavyAtoms``, is smaller than
    the atom count whenever the molecule holds explicit hydrogens, so the
    highest indices were never visited and could stay without an "AA"
    property that later code reads unconditionally.
    """
    NNAA_idx = 0
    for a_i in range(mol_e.GetNumAtoms()):
        the_atom = mol_e.GetAtomWithIdx(a_i)
        # Re-checked on every iteration: earlier seeds label whole groups.
        if not the_atom.HasProp("AA"):
            label_unmatched_NNAA(mol_e, the_atom.GetIdx(), NNAA_idx, peptide_bonds)
            NNAA_idx += 1
    return NNAA_idx
|
| 205 |
+
|
| 206 |
+
def prepare_graph(first_atom_index):
    """Return the initial ``(queue, visited)`` pair for a breadth-first walk.

    Both containers start out holding only ``first_atom_index``.
    """
    seed = first_atom_index
    return deque((seed,)), {seed}
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def enqueue_neighbor_indices(mol_f, atom, queue, visited):
    """Append the not-yet-visited neighbours of ``atom`` to ``queue``.

    Each newly queued index is also added to ``visited``. Both containers
    are modified in place and returned.
    """
    for _, neighbor_atom_idx in get_neighbors(mol_f, atom):
        if neighbor_atom_idx in visited:
            continue
        queue.append(neighbor_atom_idx)
        visited.add(neighbor_atom_idx)
    return queue, visited
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def get_neighbors(mol_g, atom):
    """Return ``[neighbor_atom, neighbor_index]`` for each neighbour of ``atom``.

    The atom objects are looked up again on ``mol_g`` by index.
    """
    neighbor_ids = (neighbor.GetIdx() for neighbor in atom.GetNeighbors())
    return [[mol_g.GetAtomWithIdx(idx), idx] for idx in neighbor_ids]
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def cross_peptide_bond(mol_f, current_atom_idx, neighbor_idx, peptide_bonds):
    """Return True when the bond joining the two atoms is listed in ``peptide_bonds``."""
    connecting_bond = mol_f.GetBondBetweenAtoms(current_atom_idx, neighbor_idx)
    return connecting_bond.GetIdx() in peptide_bonds
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def NNAA_continues(neighbor_atom, first_AA_observed):
    """Decide whether the walk may continue onto ``neighbor_atom``.

    True when the neighbour has no "AA" label yet, or when its label equals
    ``first_AA_observed``.
    """
    if not neighbor_atom.HasProp("AA"):
        return True
    return neighbor_atom.GetProp("AA") == first_AA_observed
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def get_current_atom_with_prop(mol_h, atom_idx_queue, prop):
    """Pop the next index from the queue, label that atom, and return it.

    The atom's "AA" property is overwritten with ``prop``. Returns
    ``(atom, atom_index)``.
    """
    popped_idx = atom_idx_queue.popleft()
    popped_atom = mol_h.GetAtomWithIdx(popped_idx)
    popped_atom.SetProp("AA", prop)
    return popped_atom, popped_idx
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def label_unmatched_NNAA(mol, atom_index_of_the_NNAA, NNAA_idx, peptide_bonds):
    """Breadth-first label one unmatched group of atoms as "Unk<NNAA_idx>".

    Starting from ``atom_index_of_the_NNAA``, the walk spreads over bonds
    that are not listed in ``peptide_bonds``. Every atom taken from the queue
    has its "AA" property overwritten with the "Unk" label. Unlabelled
    neighbours are always followed. The first already-labelled neighbour
    that is met defines ``first_AA_observed``; from then on atoms carrying
    that same label are followed too (and therefore relabelled), while atoms
    with any other label are marked visited but not entered.

    Mutates ``mol`` in place and returns None.
    """
    atom_idx_queue, visited_atoms = prepare_graph(atom_index_of_the_NNAA)
    first_AA_observed = None
    prop = f"Unk{NNAA_idx}"
    while atom_idx_queue:
        # Popping also writes the "Unk" label onto the atom.
        current_atom, current_atom_idx = get_current_atom_with_prop(
            mol, atom_idx_queue, prop
        )
        neighbors_and_indices = get_neighbors(mol, current_atom)
        for neighbor in neighbors_and_indices:
            neighbor_atom, neighbor_idx = neighbor
            # The bond lookup only runs for neighbours that were not visited yet.
            if neighbor_idx not in visited_atoms and not cross_peptide_bond(
                mol, current_atom_idx, neighbor_idx, peptide_bonds
            ):
                visited_atoms.add(neighbor_idx)
                if NNAA_continues(neighbor_atom, first_AA_observed):
                    atom_idx_queue.append(neighbor_idx)
                elif first_AA_observed is None:  # first labelled neighbour seen so far
                    first_AA_observed = neighbor_atom.GetProp("AA")
                    atom_idx_queue.append(neighbor_idx)
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def get_first_base_aa(mol_j, first_atom_index):
    """Return the "AA" label of the atom at ``first_atom_index``."""
    return mol_j.GetAtomWithIdx(first_atom_index).GetProp("AA")
|
| 275 |
+
|
| 276 |
+
def label_boundary_bonds(mol):
    """Mark every bond whose two atoms carry different "AA" labels.

    Such a bond gets "boundary" = "boundary", and both of its atoms get
    "bond_site" = "bond_site".
    """
    for bond in mol.GetBonds():
        begin_idx, end_idx, begin_label, end_label = get_connected_atoms_and_props(bond, mol)
        if begin_label == end_label:
            continue
        bond.SetProp("boundary", "boundary")
        for atom_idx in (begin_idx, end_idx):
            mol.GetAtomWithIdx(atom_idx).SetProp("bond_site", "bond_site")
|
| 285 |
+
|
| 286 |
+
def add_order_to_atomNote(mol_v, aa_order, current_base_aa):
    """Write "<label>:<order>" into "atomNote" for all atoms labelled ``current_base_aa``."""
    note_text = f"{current_base_aa}:{aa_order}"
    for atom in map(mol_v.GetAtomWithIdx, range(mol_v.GetNumAtoms())):
        if atom.GetProp("AA") == current_base_aa:
            atom.SetProp("atomNote", note_text)
|
| 291 |
+
|
| 292 |
+
def reorder_AAs(mol_k, first_atom_index):
    """Walk the molecule from ``first_atom_index`` and list residue labels in visiting order.

    The walk is breadth-first inside the current residue. As soon as an atom
    with a different "AA" label is taken from the queue, the current label is
    appended to the result, the queue is reset to that atom only, and the
    walk continues inside the new residue. Atoms of the current residue also
    get an "atomNote" of the form "<label>:<order>", with order starting at 1.

    Returns the list of "AA" labels in the order the residues were entered.
    """
    atom_idx_queue, visited_atom_indices = prepare_graph(first_atom_index)
    aa_list = []
    aa_order = 1
    current_base_aa = get_first_base_aa(mol_k, first_atom_index)

    while atom_idx_queue:
        # Runs once per dequeued atom and rescans every atom of the molecule.
        add_order_to_atomNote(mol_k, aa_order, current_base_aa)
        atom_index = atom_idx_queue.popleft()
        the_atom = mol_k.GetAtomWithIdx(atom_index)
        aa_in_question = the_atom.GetProp("AA")
        if current_base_aa != aa_in_question:
            # Entering a new residue: the pending indices of the old one are dropped.
            current_base_aa, atom_idx_queue = switch_base_and_empty_queue(
                aa_list, current_base_aa, aa_in_question, atom_idx_queue, atom_index
            )
            # NOTE(review): indentation was lost in this copy of the file; the
            # order counter is assumed to advance only on a residue switch -- confirm.
            aa_order += 1
        enqueue_neighbor_indices(mol_k, the_atom, atom_idx_queue, visited_atom_indices)

    aa_list.append(current_base_aa)  # append the last AA
    return aa_list
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def switch_base_and_empty_queue(
    aa_list, current_base_aa, aa_in_question, atom_idx_queue, idx
):
    """Close the current residue and start a fresh queue for the next one.

    Appends ``current_base_aa`` to ``aa_list`` and returns
    ``(aa_in_question, deque([idx]))``. The queue passed in is not modified;
    it is simply replaced by the new one.
    """
    aa_list.append(current_base_aa)
    return aa_in_question, deque([idx])
|
| 321 |
+
|
| 322 |
+
def label_belongs_to_AA(label):
    """Return False for labels starting with "Unk" or "X", True for anything else."""
    is_unknown = label[:3] == "Unk"
    return not (is_unknown or label.startswith("X"))
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def record_if_terminal(peptide_bonded_props, peptide_bonded_atoms, prop, atom):
    """Toggle ``prop`` in the list of residues seen at peptide bonds.

    A label that is already present is removed, so labels seen an even
    number of times drop out and labels seen an odd number of times stay.
    Both lists are modified in place; nothing is returned.
    """
    if (
        prop in peptide_bonded_props
    ):  # the peptide bond was seen twice i.e. it has both ends
        peptide_bonded_props.remove(prop)
    else:
        peptide_bonded_props.append(prop)
        # NOTE(review): indentation was lost in this copy of the file; the atom
        # is assumed to be recorded only together with a newly added label -- confirm.
        peptide_bonded_atoms.append(atom)
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
def get_first_atom_index(mol_l, peptide_bonded_props, peptide_bonded_atoms):
    """Return the lowest index of a carbon atom that qualifies as a start atom.

    An atom qualifies when its index is in ``peptide_bonded_atoms``, its
    "AA" label is in ``peptide_bonded_props`` and its symbol is "C".
    Falls back to 0 when no atom qualifies.
    """
    for candidate_idx in range(mol_l.GetNumAtoms()):
        candidate = mol_l.GetAtomWithIdx(candidate_idx)
        if candidate_idx not in peptide_bonded_atoms:
            continue
        if candidate.GetProp("AA") not in peptide_bonded_props:
            continue
        if candidate.GetSymbol() == "C":
            return candidate_idx
    return 0
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def mol_is_cyclic_peptide(mol_u, ignore_cyclic_peptide):
    """Report whether a residue-joining bond of ``mol_u`` lies in a ring.

    Args:
        mol_u: molecule whose bonds may carry the "boundary" or
            "peptide_bond" property.
        ignore_cyclic_peptide: when False the check is disabled and the
            function returns False without looking at the molecule.

    Returns:
        True if any ring bond has the "boundary" or "peptide_bond" property,
        otherwise False. The negative case used to fall off the end of the
        function and return None, which compares unequal to False.
    """
    if not ignore_cyclic_peptide:
        return False
    return any(
        bond.IsInRing() and (bond.HasProp("boundary") or bond.HasProp("peptide_bond"))
        for bond in mol_u.GetBonds()  # any bond, including peptide bonds
    )
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
def search_terminal_AA(mol_m):  # for highlight and searching terminal AA
    """Collect the residue labels and atoms that sit at one end of the chain.

    Both atoms of every bond tagged "peptide_bond" are passed through
    ``record_if_terminal``. Returns ``(peptide_bonded_props,
    peptide_bonded_atoms)``.
    """
    terminal_props, terminal_atoms = [], []
    for bond in mol_m.GetBonds():  # every bond is inspected, not only peptide bonds
        begin_idx, end_idx, begin_label, end_label = get_connected_atoms_and_props(bond, mol_m)
        if not bond.HasProp("peptide_bond"):
            continue
        # A label stays in the list only if it was seen at a single peptide bond.
        for label, atom_idx in ((begin_label, begin_idx), (end_label, end_idx)):
            record_if_terminal(terminal_props, terminal_atoms, label, atom_idx)
    return terminal_props, terminal_atoms
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def get_connected_atoms_and_props(bond, mol_t):
    """Return ``(begin_idx, end_idx, begin_label, end_label)`` for ``bond``.

    The labels are the "AA" properties of the two atoms, read from ``mol_t``.
    """
    begin_idx = bond.GetBeginAtomIdx()
    end_idx = bond.GetEndAtomIdx()
    begin_label = mol_t.GetAtomWithIdx(begin_idx).GetProp("AA")
    end_label = mol_t.GetAtomWithIdx(end_idx).GetProp("AA")
    return begin_idx, end_idx, begin_label, end_label
|
| 381 |
+
|
| 382 |
+
def write_seq(aa_list):
    """Translate residue labels into sequence tokens.

    "Unk..." becomes "?", labels starting with "X" keep the part before the
    first ":", and everything else is mapped from its three-letter prefix to
    the one-letter code.
    """

    def to_token(aa):
        if aa[:3] == "Unk":
            return "?"
        if aa.startswith("X"):
            return aa.split(":")[0]
        return three2one_letter[aa[:3]]

    return [to_token(aa) for aa in aa_list]
|
| 393 |
+
|
| 394 |
+
def get_NNAAs(mol):
    """Cut ``mol`` at its peptide bonds and return the non-standard fragments.

    Works on an editable copy: acyclic peptide bonds are cut first, then
    atoms labelled as standard amino acids are deleted. The remaining
    fragments are returned as sanitized molecules; the string "error" is
    returned when fragment extraction raises ValueError.
    """
    editable = Chem.RWMol(mol)
    remove_peptide_bonds(editable)  # must happen before atoms are deleted
    remove_atoms(editable, mol, label_belongs_to_AA)
    try:
        fragments = Chem.GetMolFrags(editable, asMols=True, sanitizeFrags=True)
    except ValueError:
        return "error"
    return fragments
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
def remove_atoms(rwmol, mol, func, **kwargs):
    """Delete from ``rwmol`` every atom whose "AA" label satisfies ``func``.

    Indices run from ``mol.GetNumAtoms() - 1`` down to 0, so atoms that were
    appended to ``rwmol`` beyond the size of ``mol`` are never inspected.
    ``func`` is called as ``func(label, **kwargs)``.
    """
    for atom_number in range(mol.GetNumAtoms() - 1, -1, -1):
        label = rwmol.GetAtomWithIdx(atom_number).GetProp("AA")
        if func(label, **kwargs):
            rwmol.RemoveAtom(atom_number)
|
| 411 |
+
|
| 412 |
+
def add_OH(rwmol, begin_atom_idx, end_atom_idx):
    """Append an oxygen atom and single-bond it to whichever given atom is a carbon.

    The begin atom is preferred. If neither atom is a carbon the oxygen is
    still added but stays unbonded.
    """
    rwmol.AddAtom(oxygen)
    new_oxygen_idx = rwmol.GetNumAtoms() - 1
    carbon = 6  # atomic number
    if rwmol.GetAtomWithIdx(begin_atom_idx).GetAtomicNum() == carbon:
        rwmol.AddBond(begin_atom_idx, new_oxygen_idx, Chem.BondType.SINGLE)
        return
    if rwmol.GetAtomWithIdx(end_atom_idx).GetAtomicNum() == carbon:
        rwmol.AddBond(new_oxygen_idx, end_atom_idx, Chem.BondType.SINGLE)
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
def remove_peptide_bonds(rwmol):
    """Cut every acyclic peptide bond of ``rwmol`` and cap the carbon with an oxygen.

    Bonds tagged "peptide_bond" that are not in a ring are removed, and
    ``add_OH`` attaches a new oxygen to the carbon end. Ring peptide bonds
    are left untouched. Mutates ``rwmol`` in place and returns None.
    """
    # Walk from the highest bond index down to 0, starting from the bond
    # count taken before any edit.
    current_bond_idx = rwmol.GetNumBonds() - 1
    while current_bond_idx >= 0:
        current_bond = rwmol.GetBondWithIdx(current_bond_idx)
        if current_bond.HasProp("peptide_bond") and current_bond.IsInRing() == False:
            # Read both ends before the bond is removed.
            begin_atom_idx, end_atom_idx = current_bond.GetBeginAtomIdx(), current_bond.GetEndAtomIdx()
            rwmol.RemoveBond(
                begin_atom_idx, end_atom_idx
            )
            add_OH(rwmol, begin_atom_idx, end_atom_idx)
        current_bond_idx -= 1
|
| 432 |
+
|
| 433 |
+
def detect_terminal(NNAA):
    """Return "NotTer" when ``NNAA`` contains ``valid_backbone``, otherwise "ter"."""
    # "ter" stays lowercase on purpose: the tokenizer relies on it.
    return "NotTer" if NNAA.HasSubstructMatch(valid_backbone) else "ter"
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
def enlist_NNAA(new_NNAA, df, ter_or_not, bond_atom_indices):
    """Return ``df`` with one row for ``new_NNAA`` added, de-duplicated by SMILES.

    The new row holds the canonical isomeric SMILES, the terminal flag, the
    molecule itself, and a "BOND SITES" list made of the SMILES rooted at
    atom 0 followed by ``bond_atom_indices``. The input frame is not
    modified.
    """

    def canonical_smiles(**extra):
        return Chem.MolToSmiles(new_NNAA, isomericSmiles=True, canonical=True, **extra)

    plain_smiles = canonical_smiles()
    bond_sites = [canonical_smiles(rootedAtAtom=0)] + bond_atom_indices
    addition = pd.DataFrame({
        'SMILES': [plain_smiles],
        'TERMINAL': [ter_or_not],
        'BOND SITES': [bond_sites],
        'MOL': [new_NNAA]
    })

    combined = pd.concat([df, addition], ignore_index=True)
    # keep the first occurrence of every SMILES
    return combined.drop_duplicates(subset=['SMILES'])
|
| 457 |
+
|
| 458 |
+
def add_IDs(df):
    """Fill the 'ID' column of ``df`` in place and return ``df``.

    Rows sharing a 'TAUTOMER HASH' get the same ID "X<n>", numbered in order
    of first appearance. Rows whose 'TERMINAL' value is 'ter' get "ter"
    appended to their ID.
    """
    distinct_hashes = df['TAUTOMER HASH'].drop_duplicates().reset_index(drop=True)

    for ordinal, current_hash in enumerate(distinct_hashes):
        same_hash = df['TAUTOMER HASH'] == current_hash
        df.loc[same_hash, 'ID'] = f"X{ordinal}"

    # terminal entries carry a "ter" suffix
    is_terminal = df['TERMINAL'] == 'ter'
    df.loc[is_terminal, 'ID'] = df['ID'] + 'ter'

    return df
|
| 469 |
+
|
| 470 |
+
def relabel_NNAA(mol, NNAA_df):
    """Replace "Unk<n>" labels in ``mol`` with the IDs of matching known NNAAs.

    For each distinct "Unk" label, a copy of ``mol`` is reduced to the atoms
    carrying that label and compared against the 'MOL' entry of every row of
    ``NNAA_df``. On the first perfect match all atoms with that label are
    relabelled "<ID>:<k>", where k counts earlier uses of the same ID in this
    molecule. Labels without a match are left as they are.

    Relabelling is best effort: a failure while handling one atom skips that
    atom. Only ``Exception`` subclasses are skipped, so KeyboardInterrupt and
    SystemExit are no longer swallowed as they were with a bare ``except``.

    Returns ``mol`` (modified in place).
    """
    visited_Unk_labels = set()
    # How often each NNAA ID has been assigned so far in this molecule.
    times_seen = defaultdict(int)
    for atom_idx in range(mol.GetNumAtoms()):
        try:
            label = mol.GetAtomWithIdx(atom_idx).GetProp("AA")
            if not label.startswith("Unk") or label in visited_Unk_labels:
                continue
            visited_Unk_labels.add(label)
            rwmol_from_peptide = Chem.RWMol(mol)
            # Keep only the atoms of this one unknown group.
            remove_atoms(rwmol_from_peptide, mol, different_NNAA, Unk_label=label)
            for _, NNAA_row in NNAA_df.iterrows():
                if perfect_match(rwmol_from_peptide, NNAA_row['MOL']):
                    nnaa_name = NNAA_row['ID']
                    nnaa_prop = f"{nnaa_name}:{times_seen[nnaa_name]}"
                    mol = relabel_prop(mol, label, nnaa_prop)
                    times_seen[nnaa_name] += 1
                    break
        except Exception:
            # Deliberately best effort: leave this atom's label unchanged.
            continue
    return mol
|
| 490 |
+
|
| 491 |
+
def different_NNAA(label, Unk_label):
    """Return True when ``label`` is not the unknown-group label ``Unk_label``."""
    return Unk_label != label
|
| 493 |
+
|
| 494 |
+
|
| 495 |
+
def relabel_prop(mol, label, nnaa_name):
    """Set "AA" to ``nnaa_name`` on every atom whose "AA" equals ``label``.

    Atoms without an "AA" property are ignored. Modifies ``mol`` in place
    and returns it. A failure on one atom skips that atom; the handler was
    narrowed from a bare ``except`` so that KeyboardInterrupt and SystemExit
    propagate.
    """
    for atom_idx in range(mol.GetNumAtoms()):
        try:
            atom = mol.GetAtomWithIdx(atom_idx)
            if atom.HasProp("AA") and atom.GetProp("AA") == label:
                atom.SetProp("AA", nnaa_name)
        except Exception:
            # Best effort, as before: leave this atom untouched.
            continue
    return mol
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
def perfect_match(rwmol_NNAA, nnaa_mol):
    """Return True when ``nnaa_mol`` matches ``rwmol_NNAA`` and both have the same atom count.

    The substructure test honours chirality and is evaluated first.
    """
    structure_matches = rwmol_NNAA.HasSubstructMatch(nnaa_mol, useChirality=True)
    if not structure_matches:
        return structure_matches
    return nnaa_mol.GetNumAtoms() == rwmol_NNAA.GetNumAtoms()
|
| 511 |
+
|
| 512 |
+
def NNAAs_with_OH_removed(NNAA_df):
    """Append variants of each NNAA with the flagged backbone atom(s) removed.

    For every row, the atoms found at ``OH_position`` of each
    ``loose_backbone`` match are flagged and then removed one by one from a
    copy of the row's molecule.  After each removal a copy of the row carrying
    the reduced molecule in 'MOL' is appended to the returned table.

    Args:
        NNAA_df: DataFrame with a 'MOL' column of molecules.

    Returns:
        ``NNAA_df`` followed by the additional rows, with a fresh integer index.
    """
    new_rows = []

    for _, row in NNAA_df.iterrows():
        rwmol_NNAA = Chem.RWMol(row['MOL'])

        for backbone_index in rwmol_NNAA.GetSubstructMatches(loose_backbone):
            OH_atom_i = backbone_index[OH_position]
            rwmol_NNAA.GetAtomWithIdx(OH_atom_i).SetProp("ToBeRemoved", "ToBeRemoved")

        # Walk the indices from high to low so a removal never disturbs the
        # indices still to be visited.  The former hand-written countdown
        # could step over the atom right below a removed one.
        for atom_idx in reversed(range(rwmol_NNAA.GetNumAtoms())):
            if not rwmol_NNAA.GetAtomWithIdx(atom_idx).HasProp("ToBeRemoved"):
                continue
            rwmol_NNAA.RemoveAtom(atom_idx)

            # Same row data, except for the modified 'MOL'.
            new_row = row.copy()
            new_row['MOL'] = rwmol_NNAA.GetMol()
            new_rows.append(new_row)

    if not new_rows:
        # Nothing to add; still hand back a freshly indexed table like concat would.
        return NNAA_df.reset_index(drop=True)

    return pd.concat([NNAA_df, pd.DataFrame(new_rows)], ignore_index=True)
|
| 543 |
+
|
| 544 |
+
def remove_small_substructs(mol):
    """Keep only the largest disconnected fragment of ``mol``.

    Args:
        mol: an RDKit molecule.

    Returns:
        ``(mol, False)`` when the molecule has at most one fragment, otherwise
        ``(largest_fragment, message)`` where ``message`` is a non-empty
        string describing what happened.
    """
    substructures = Chem.GetMolFrags(mol, asMols=True)
    if len(substructures) <= 1:
        return mol, False

    error = "Multiple substructures. Removing the smaller ones."
    # Return the largest fragment directly (first one wins on ties).  The
    # previous loop restarted from the untouched ``mol`` for every deletion, so
    # with three or more fragments only the last deletion took effect, and it
    # used each small fragment as a substructure query, which may also match
    # atoms inside the main fragment.
    largest_fragment = max(substructures, key=lambda frag: frag.GetNumAtoms())
    return largest_fragment, error
|
| 556 |
+
|
| 557 |
+
def has_unlabelled_atom(mol, seq_list):
    """Report whether the sequence has a "?" token or any atom lacks an "AA" property."""
    if "?" in seq_list:
        return True
    return any(not atom.HasProp("AA") for atom in mol.GetAtoms())
|
| 564 |
+
|
| 565 |
+
def linear(peptide_bonds, aminos):
    """Return True when there is exactly one fewer peptide bond than residues."""
    expected_bond_count = len(aminos) - 1
    return expected_bond_count == len(peptide_bonds)
|
| 567 |
+
|
| 568 |
+
def ter_in_the_middle(seq_list):
    """Return True if a token ending in "ter" sits at a non-terminal position.

    Args:
        seq_list: sequence tokens in order.

    Returns:
        bool: True when a "...ter" token appears anywhere but the first or
        last position, False otherwise.  (The earlier version returned None
        rather than False in the negative case.)
    """
    last_index = len(seq_list) - 1
    return any(
        amino.endswith("ter") and 0 < position < last_index
        for position, amino in enumerate(seq_list)
    )
|
| 572 |
+
|
| 573 |
+
def filter_out(seq_list, mol, peptide_bonds):
    """Return the first reason to reject a labelled molecule, or False if none applies."""
    # Checks are evaluated lazily and in this order; the first hit wins.
    rejection_checks = (
        (lambda: not linear(peptide_bonds, seq_list), "Not linear"),
        (lambda: has_unlabelled_atom(mol, seq_list), "Has unlabelled atom"),
        (lambda: ter_in_the_middle(seq_list), "Terminal amino acid in the middle"),
    )
    for is_rejected, reason in rejection_checks:
        if is_rejected():
            return reason
    return False
|
| 581 |
+
|
| 582 |
+
def record_bond_sites(NNAA):
    """List the indices of all atoms that carry a "bond_site" property."""
    return [atom.GetIdx() for atom in NNAA.GetAtoms() if atom.HasProp("bond_site")]
|
| 588 |
+
|
| 589 |
+
def count_aminos(split_seq, NNAA_counts):
    """Tally every token starting with "X" into ``NNAA_counts`` and return that dict.

    ``NNAA_counts`` is updated in place, so counts accumulate across calls.
    """
    for amino in split_seq:
        if not amino.startswith("X"):
            continue
        NNAA_counts[amino] = NNAA_counts.get(amino, 0) + 1
    return NNAA_counts
|
| 598 |
+
|
| 599 |
+
def load_data(input_file):
    """Read the tab-separated input table and normalise its ID/SMILES columns.

    A sequential 'ID' column is created when missing.  'SMILES' is taken from
    'ISOSMILES' where that is present, otherwise from the existing 'SMILES'
    value, and is whitespace-stripped.  Rows without a usable SMILES string
    are dropped and the 'ISOSMILES' column is removed from the result.
    """
    print("0/4 Loading input data...")
    table = pd.read_csv(input_file, sep='\t', on_bad_lines='warn')

    # Number the rows ourselves when the file brings no identifiers.
    if 'ID' not in table.columns:
        table['ID'] = range(1, len(table) + 1)

    # Make sure both SMILES sources exist so the fallback below always works.
    for smiles_column in ('ISOSMILES', 'SMILES'):
        if smiles_column not in table.columns:
            table[smiles_column] = None

    # Prefer the isomeric SMILES, fall back to the plain one.
    table['SMILES'] = table['ISOSMILES'].fillna(table['SMILES']).str.strip()
    table = table.drop(columns=['ISOSMILES'])

    # Keep only rows that actually have a SMILES string.
    has_smiles = table['SMILES'].notna() & table['SMILES'].ne("")
    return pd.DataFrame(table[has_smiles])
|
| 631 |
+
|
| 632 |
+
def process_molecule_batch(batch_df, smi2mol, ignore_cyclic_peptide, min_amino_acids, progress_bar):
    """Label every molecule of one batch and collect the per-molecule outcome.

    Args:
        batch_df: rows to process; each row needs a 'SMILES' value.
        smi2mol: passed through to ``match_AA``.
        ignore_cyclic_peptide: passed through to ``mol_is_cyclic_peptide``.
        min_amino_acids: molecules with fewer than ``min_amino_acids - 1``
            peptide bonds are rejected.
        progress_bar: object with ``update``; advanced once per batch.

    Returns:
        A list of tuples
        ``(mol_index, mol, all_AA, error, peptide_bonds, NNAAs_info)``.
        Rejected molecules have ``mol`` set to None and ``error`` set to the reason.
    """
    def rejected(index, reason):
        # Uniform shape for every failure record.
        return (index, None, None, reason, None, None)

    local_mol_data = []

    for mol_index, row in batch_df.iterrows():
        try:
            smi = row['SMILES']
            if not smi:
                local_mol_data.append(rejected(mol_index, "No SMILES provided"))
                continue

            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                local_mol_data.append(rejected(mol_index, "Invalid SMILES"))
                continue

            mol, error = remove_small_substructs(mol)
            if error:
                local_mol_data.append(rejected(mol_index, error))
                continue

            match_AA(mol, smi2mol)
            peptide_bonds = label_peptide_bonds(mol)

            if len(peptide_bonds) < min_amino_acids - 1:
                local_mol_data.append(rejected(mol_index, "Not enough amino acids"))
                continue

            num_NNAAs = label_NNAAs(mol, peptide_bonds)
            all_AA = num_NNAAs == 0

            label_boundary_bonds(mol)

            if mol_is_cyclic_peptide(mol, ignore_cyclic_peptide):
                local_mol_data.append(rejected(mol_index, "Cyclic peptide"))
                continue

            NNAAs_info = []
            if not all_AA:
                NNAAs = get_NNAAs(mol)
                if NNAAs == "error":
                    local_mol_data.append(rejected(mol_index, "Disconnected molecule"))
                    continue
                for NNAA in NNAAs:
                    ter_or_not = detect_terminal(NNAA)
                    bond_sites = record_bond_sites(NNAA)
                    NNAAs_info.append((NNAA, ter_or_not, bond_sites))

            local_mol_data.append((mol_index, mol, all_AA, None, peptide_bonds, NNAAs_info))

        except Exception:
            # One bad molecule must not abort the batch; ``Exception`` (instead
            # of a bare ``except``) still lets KeyboardInterrupt/SystemExit through.
            local_mol_data.append(rejected(mol_index, "Unknown error"))

    progress_bar.update(1)
    return local_mol_data
|
| 687 |
+
|
| 688 |
+
def label_molecules_in_batches(mol_df, batch_size, smi2mol, ignore_cyclic_peptide, min_amino_acids, max_workers):
    """Label all molecules in parallel batches and collect the NNAAs they contain.

    Args:
        mol_df: molecule table; the columns 'ERROR', 'MOL', 'ALL AA' and
            'PEPTIDE BONDS' are (re)initialised and then filled in place.
        batch_size: number of rows handed to each worker task.
        smi2mol, ignore_cyclic_peptide, min_amino_acids: forwarded to
            ``process_molecule_batch``.
        max_workers: thread-pool size.

    Returns:
        ``(NNAA_df, mol_df)``: the table built by ``enlist_NNAA`` and the
        updated molecule table.
    """
    # Initialize columns and dataframes
    mol_df[['ERROR', 'MOL', 'ALL AA', 'PEPTIDE BONDS']] = ["", "", False, ""]
    NNAA_df = pd.DataFrame(columns=['ID', 'SMILES', 'TERMINAL', 'BOND SITES'])

    indices = list(mol_df.index)
    batches = [indices[i:i + batch_size] for i in range(0, len(indices), batch_size)]

    # One tick per submitted batch.  ``len(indices) // batch_size`` missed the
    # final partial batch, so the bar could overrun its total.
    progress_bar = tqdm(total=len(batches), desc="1/4 Labelling molecules", leave=True)

    # Use ThreadPoolExecutor for parallel batch processing
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                process_molecule_batch,
                mol_df.loc[batch_indices],
                smi2mol,
                ignore_cyclic_peptide,
                min_amino_acids,
                progress_bar,
            )
            for batch_indices in batches
        ]

    # Leaving the ``with`` block waits for every task, so the bar is complete here.
    progress_bar.close()

    with tqdm(total=len(mol_df), desc="2/4 Storing NNAAs") as pbar:
        for future in as_completed(futures):
            batch_results = future.result()

            for mol_index, mol, all_AA, error, peptide_bonds, NNAAs_info in batch_results:
                if mol is None:
                    mol_df.at[mol_index, 'ERROR'] = error
                    continue

                mol_df.at[mol_index, 'MOL'] = mol
                mol_df.at[mol_index, 'ALL AA'] = all_AA
                mol_df.at[mol_index, 'PEPTIDE BONDS'] = peptide_bonds

                if NNAAs_info:
                    for NNAA, ter_or_not, bond_sites in NNAAs_info:
                        NNAA_df = enlist_NNAA(NNAA, NNAA_df, ter_or_not, bond_sites)

            pbar.update(len(batch_results))

    return NNAA_df, mol_df
|
| 728 |
+
|
| 729 |
+
def highlight_bonds_with_AA(mol_s):  # with AA colors
    """Map bond index -> list of colours for bonds lying inside a single amino acid.

    A bond is coloured when the label of its first atom belongs to an amino
    acid and both atoms carry the same label; the colour is looked up in
    ``aa2color_dict`` by the first three characters of that label.
    """
    bond_highlights = defaultdict(list)
    for bond in mol_s.GetBonds():
        _, _, prop1, prop2 = get_connected_atoms_and_props(bond, mol_s)
        within_same_AA = label_belongs_to_AA(prop1) and prop1 == prop2
        if not within_same_AA:
            continue
        bond_highlights[bond.GetIdx()].append(aa2color_dict[prop1[:3]])
    return bond_highlights
|
| 736 |
+
|
| 737 |
+
|
| 738 |
+
def relabel_batch(mol_df, NNAA_df):
    """Derive the sequence string and bond highlights for each molecule of a batch.

    Args:
        mol_df: rows with the columns 'ID', 'MOL', 'ALL AA' and 'PEPTIDE BONDS'.
        NNAA_df: NNAA table forwarded to ``relabel_NNAA``.

    Returns:
        DataFrame with the columns 'ID', 'SEQUENCE', 'ERROR' and
        'BOND HIGHLIGHTS', one row per input row.  'SEQUENCE' is empty when
        the molecule is filtered out or processing raised.
    """
    local_mol_data = []

    for _, row in mol_df.iterrows():
        mol_index = row['ID']
        mol = row['MOL']
        all_AA = row['ALL AA']
        peptide_bonds = row['PEPTIDE BONDS']

        # Per-row defaults.  Previously ``bond_highlights`` was unbound (first
        # row) or left over from the previous row when an exception fired
        # before it was assigned.
        seq = ""
        error = None
        bond_highlights = {}

        try:
            # Process molecule if not all amino acids are labeled
            if not all_AA:
                mol = relabel_NNAA(mol, NNAA_df)

            bond_highlights = highlight_bonds_with_AA(mol)
            peptide_bonded_props, peptide_bonded_atoms = search_terminal_AA(mol)
            first_atom_index = get_first_atom_index(mol, peptide_bonded_props, peptide_bonded_atoms)
            aa_list = reorder_AAs(mol, first_atom_index)
            split_seq = write_seq(aa_list)
            seq = "".join(split_seq)

            error = filter_out(split_seq, mol, peptide_bonds)
            if error:
                seq = ""

        except Exception as e:
            error = str(e)  # Ensure error is a string
            seq = ""

        local_mol_data.append({'ID': mol_index, 'SEQUENCE': seq, 'ERROR': error, 'BOND HIGHLIGHTS': bond_highlights})

    return pd.DataFrame(local_mol_data)
|
| 774 |
+
|
| 775 |
+
def relabel_batches(mol_df, NNAA_df, batch_size):
    """Run ``relabel_batch`` over the labelled molecules in threaded batches.

    Adds an empty 'BOND HIGHLIGHTS' column to ``mol_df`` (in place), processes
    every row whose 'MOL' is not the empty string, and returns a copy of
    ``mol_df`` in which 'SEQUENCE', 'ERROR' and 'BOND HIGHLIGHTS' are filled
    in for the rows with a matching 'ID'.
    """
    if NNAA_df.empty:
        print("Warning: NNAA_df is empty. No NNAAs to process.")

    # Ensure NNAA_df has an index
    if NNAA_df.index.empty:
        NNAA_df = NNAA_df.reset_index(drop=True)

    mol_df['BOND HIGHLIGHTS'] = ""
    labelled_df = mol_df[mol_df['MOL'] != ""].copy()
    labelled_indices = list(labelled_df.index)
    index_chunks = [
        labelled_indices[start:start + batch_size]
        for start in range(0, len(labelled_indices), batch_size)
    ]

    def process_batch(batch_indices):
        return relabel_batch(labelled_df.loc[batch_indices], NNAA_df)

    result_columns = ['SEQUENCE', 'ERROR', 'BOND HIGHLIGHTS']

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_batch, chunk) for chunk in index_chunks]

        local_mol_df = mol_df.copy()

        for future in tqdm(as_completed(futures), total=len(futures), desc="4/4 Relabelling mols"):
            mol_dataset_per_batch = future.result()

            # Copy each result back onto the row(s) sharing its ID.
            for _, row in mol_dataset_per_batch.iterrows():
                local_mol_df.loc[local_mol_df['ID'] == row['ID'], result_columns] = row[result_columns].values

    return local_mol_df
|
| 806 |
+
|
| 807 |
+
|
| 808 |
+
|
| 809 |
+
def output_NNAA(NNAA_df, output_dir):
    """Write one row per tautomer hash to ``<output_dir>/raw/ncAAs_raw.txt``.

    Per 'TAUTOMER HASH' group the first ID, SMILES, TERMINAL and BOND SITES
    values are kept, and the distinct SMILES of the group are joined with
    commas into 'TAUTOMERS'.  The 'MOL' and 'TAUTOMER HASH' columns are not
    written.  The ``raw`` sub-directory must already exist.
    """
    table = NNAA_df.drop(columns=['MOL'])
    table['TAUTOMERS'] = None

    # Collapse the table to one record per tautomer hash.
    per_tautomer = table.groupby('TAUTOMER HASH').agg(
        ID=('ID', 'first'),
        SMILES=('SMILES', 'first'),
        TAUTOMERS=('SMILES', lambda x: ','.join(x.unique())),
        TERMINAL=('TERMINAL', 'first'),
        BOND_SITES=('BOND SITES', 'first'),
    )
    per_tautomer = per_tautomer.reset_index()
    per_tautomer = per_tautomer.drop_duplicates(subset='TAUTOMER HASH', keep='first')
    per_tautomer = per_tautomer.drop(columns=['TAUTOMER HASH'])

    print(output_dir)

    destination = os.path.join(output_dir, "raw/ncAAs_raw.txt")
    per_tautomer.to_csv(destination, sep='\t', index=False)
|
| 828 |
+
|
| 829 |
+
|
| 830 |
+
def output_mols(mol_df, output_dir, draw):
    """Optionally draw each molecule, then write ``<output_dir>/raw/sequences_raw.txt``.

    The 'MOL' and 'PEPTIDE BONDS' columns are dropped from ``mol_df`` in place
    before writing; 'ID' and 'SEQUENCE' are written as the first two columns.
    Drawing failures are ignored row by row.
    """
    if draw:
        drawer = MoleculeDrawer(output_dir)

        def safe_draw(row):
            try:
                drawer.draw_input_mol(row['MOL'], row['ID'], row['SEQUENCE'], row['BOND HIGHLIGHTS'])
            except Exception:
                return None  # a molecule that cannot be drawn is simply skipped

        # Apply the safe drawing function to each row
        mol_df.apply(safe_draw, axis=1)

    mol_df.drop(columns=['MOL', 'PEPTIDE BONDS'], inplace=True)

    # bring 'SEQUENCE' column next to 'ID'
    leading = ['ID', 'SEQUENCE']
    ordered_columns = leading + [col for col in mol_df.columns if col not in leading]

    destination = os.path.join(output_dir, "raw/sequences_raw.txt")
    mol_df[ordered_columns].to_csv(destination, sep='\t', index=False)
|
| 850 |
+
|
| 851 |
+
def get_rdkit_tautomer_hash(smi):
    """Return the TAUTOMER_HASH registration layer for ``smi``, or None if it cannot be parsed."""
    parsed = Chem.MolFromSmiles(smi)
    if parsed is None:
        return None
    return RegistrationHash.GetMolLayers(parsed)[HashLayer.TAUTOMER_HASH]
|
| 857 |
+
|
| 858 |
+
def main():
    """Run the monomerization steps in order and write both raw output tables.

    Reads its settings (``input_file``, ``batch_size``, ``smi2mol``, ...) from
    module-level names rather than from parameters.
    """
    molecules = load_data(input_file)
    nnaa_table, molecules = label_molecules_in_batches(
        molecules, batch_size, smi2mol, ignore_cyclic_peptide, min_amino_acids, max_workers
    )

    # Hash first, then add the OH-removed variants, then assign IDs.
    nnaa_table['TAUTOMER HASH'] = nnaa_table['SMILES'].apply(get_rdkit_tautomer_hash)
    nnaa_table = NNAAs_with_OH_removed(nnaa_table)
    nnaa_table = add_IDs(nnaa_table)

    molecules = relabel_batches(molecules, nnaa_table, batch_size)

    output_NNAA(nnaa_table, output_dir)
    output_mols(molecules, output_dir, draw)
|
| 867 |
+
|
| 868 |
+
if __name__ == '__main__':
    args = parse_arguments()

    # main() reads these settings as module-level names, so publish them here.
    input_file, output_dir = args.input_file, args.output_dir
    batch_size, max_workers = args.batch_size, args.max_workers
    min_amino_acids = args.min_amino_acids
    ignore_cyclic_peptide = not args.process_cyclic
    draw = args.draw

    # Creating the "raw" sub-directory also creates output_dir when needed.
    os.makedirs(os.path.join(output_dir, "raw"), exist_ok=True)

    main()
|
src/prepare_GPepT_data.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import argparse
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
# Split the pipeline's sequences into a 90 % training file and a 10 %
# validation file, each sequence terminated by the GPepT end-of-text token.

# Set up argument parser
parser = argparse.ArgumentParser(description="Process sequences from an input file and split them into two output files.")
parser.add_argument('--output_dir', type=str, default='output/tmp', help="Directory containing the input file")
args = parser.parse_args()

# Define input file and output file paths
input_file = os.path.join(args.output_dir, 'standard/sequences_standardized.txt')
os.makedirs(os.path.join(args.output_dir, 'for_GPepT'), exist_ok=True)
output_file_90 = os.path.join(args.output_dir, 'for_GPepT/train90.txt')
output_file_10 = os.path.join(args.output_dir, 'for_GPepT/val10.txt')

# Check if the input file exists
if not os.path.exists(input_file):
    # No ncAAs? Then no standardized file was written; fall back to the raw sequences.
    input_file = os.path.join(args.output_dir, 'raw/sequences_raw.txt')
    if not os.path.exists(input_file):
        print(f"Error: The input file '{input_file}' does not exist.")
        # SystemExit works even where the interactive ``exit`` helper is unavailable.
        raise SystemExit(1)

# Read the input file into a pandas DataFrame
df = pd.read_csv(input_file, sep='\t')

# Rows without a sequence are read back as NaN; concatenating a string to
# them would raise TypeError, so drop missing and blank sequences first.
sequences = df['SEQUENCE'].dropna().astype(str)
sequences = sequences[sequences.str.strip() != ""]

# Add <|endoftext|> to each sequence
sequences = sequences + '<|endoftext|>'

# Shuffle the sequences to randomize the split (fixed seed for reproducibility)
sequences = sequences.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the sequences into 90% and 10%
split_index = int(0.9 * len(sequences))
sequences_90 = sequences[:split_index]
sequences_10 = sequences[split_index:]

# Write the sequences to the output files
sequences_90.to_csv(output_file_90, index=False, header=False)
sequences_10.to_csv(output_file_10, index=False, header=False)

print(f"Data has been successfully split into {output_file_90} and {output_file_10}")
|
src/standardizer.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import re
|
| 5 |
+
import argparse
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
def parse_arguments():
    """Parse the command-line options of the standardization step."""
    cli = argparse.ArgumentParser(
        description="Standardize non-canonical amino acids (ncAAs) and sequences."
    )
    cli.add_argument(
        "--output_dir",
        default='output/tmp',
        help="Directory to save output files.",
    )
    return cli.parse_args()
|
| 12 |
+
|
| 13 |
+
def main():
    """Relabel raw ncAA IDs and sequences with the IDs used in ``dictionary.txt``.

    Reads ``raw/ncAAs_raw.txt`` and ``raw/sequences_raw.txt`` below the output
    directory, matches raw ncAAs to dictionary entries by identical SMILES and
    writes:

    * ``nc_raw2standard.txt``: raw ID -> standard ID for every matched ncAA,
    * ``standard/nc_standardized.txt``: the ncAA table with relabelled IDs
      (``[UNK]`` for ncAAs missing from the dictionary),
    * ``standard/sequences_standardized.txt``: the sequences with relabelled
      tokens; sequences containing an ncAA that is not in the dictionary are
      left out.

    Any failure is reported and swallowed so the surrounding pipeline can fall
    back to the raw sequence file.
    """
    args = parse_arguments()

    output_dir = args.output_dir

    # Ensure output directory exists ('standard' implies output_dir itself)
    os.makedirs(os.path.join(output_dir, 'standard'), exist_ok=True)

    # Paths for input files
    standard_ncAAs_file = 'dictionary.txt'
    raw_ncAAs_file = os.path.join(output_dir, 'raw/ncAAs_raw.txt')
    sequence_file = os.path.join(output_dir, 'raw/sequences_raw.txt')

    # Paths for output files
    id_mapping_output = os.path.join(output_dir, 'nc_raw2standard.txt')
    relabeled_ncAAs_output = os.path.join(output_dir, 'standard/nc_standardized.txt')
    relabeled_sequence_output = os.path.join(output_dir, 'standard/sequences_standardized.txt')

    try:
        standard_ncAAs = pd.read_csv(standard_ncAAs_file, sep='\t')
        raw_ncAAs = pd.read_csv(raw_ncAAs_file, sep='\t')

        # Remove rows whose 'ID' does not start with 'X'
        raw_ncAAs = raw_ncAAs[raw_ncAAs['ID'].str.startswith('X')]
        if raw_ncAAs.empty:
            # Nothing to standardize; leave no standardized files behind so
            # later steps use the raw sequences.
            print("No ncAAs found.")
            return

        # SMILES -> standard ID, built once; the first dictionary entry wins
        # for duplicated SMILES (same result as scanning the table per row).
        known = standard_ncAAs.dropna(subset=['SMILES']).drop_duplicates(subset='SMILES', keep='first')
        smiles_to_standard_id = dict(zip(known['SMILES'], known['ID']))

        id_map = {}            # raw ID -> standard ID, matched ncAAs only
        unmatched_ids = set()  # raw IDs with no dictionary entry
        new_ids = []
        for old_id, smiles in zip(raw_ncAAs['ID'], raw_ncAAs['SMILES']):
            new_id = smiles_to_standard_id.get(smiles)
            if new_id is None:
                unmatched_ids.add(old_id)
                new_ids.append("[UNK]")
            else:
                id_map[old_id] = new_id
                new_ids.append(new_id)
        raw_ncAAs = raw_ncAAs.assign(ID=new_ids)

        # Save the ID mapping
        id_map_df = pd.DataFrame(list(id_map.items()), columns=['raw_ID', 'standard_ID'])
        id_map_df.to_csv(id_mapping_output, sep='\t', index=False)

        raw_ncAAs.to_csv(relabeled_ncAAs_output, sep='\t', index=False)

        # Load the sequence file
        sequence_df = pd.read_csv(sequence_file, sep='\t')

        # Drop rows whose 'SEQUENCE' is NaN
        sequence_df = sequence_df.dropna(subset=['SEQUENCE'])

        def relabel_sequence(sequence):
            # Split the sequence before every capital letter, which separates each ID
            tokens = re.split(r"(?=[A-Z])", sequence)
            # A sequence containing an ncAA that has no standard ID cannot be
            # expressed in the standard vocabulary.  Unmatched IDs never enter
            # id_map, so they have to be checked explicitly; otherwise the raw
            # ID would silently stay in the output.
            if any(token in unmatched_ids for token in tokens):
                return ''
            # Replace each token if it matches an old ID in the map
            return ''.join(id_map.get(token, token) for token in tokens)

        # Apply relabeling to each sequence
        sequence_df['SEQUENCE'] = sequence_df['SEQUENCE'].apply(relabel_sequence)

        # Leave out sequences that could not be standardized, in line with the
        # NaN rows dropped above, so readers never see an empty SEQUENCE field.
        sequence_df = sequence_df[sequence_df['SEQUENCE'] != '']

        # Save the relabeled sequences
        sequence_df.to_csv(relabeled_sequence_output, sep='\t', index=False)

        print("Relabeling complete.")
        print(f"ID mapping saved to: {id_mapping_output}")
        print(f"Relabeled ncAAs saved to: {relabeled_ncAAs_output}")
        print(f"Relabeled sequences saved to: {relabeled_sequence_output}")

    except Exception as e:
        # Deliberately non-fatal, but show the cause instead of hiding it.
        print("No ncAAs found.")
        print(f"Standardization skipped: {type(e).__name__}: {e}")


if __name__ == "__main__":
    main()
|