Playingyoyo committed on
Commit
084b58f
·
1 Parent(s): 47e3259

Initial Monomerizer Space

Browse files
GPepT_analysis_pipeline.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import subprocess
4
+ import sys
5
+ import datetime
6
+
7
def _with_default_options(defaults, extra_args):
    """Return default ``(flag, value)`` pairs not overridden in *extra_args*, followed by *extra_args*."""
    extra = list(extra_args or [])
    merged = []
    for flag, value in defaults:
        # A flag counts as supplied in either "--flag value" or "--flag=value" form.
        supplied = any(tok == flag or tok.startswith(flag + "=") for tok in extra)
        if not supplied:
            merged.extend([flag, value])
    # Caller tokens go last so they are the final occurrence on the command line.
    return merged + extra


def run_pipeline(sequence_file, output_dir, demonomerized_file, demonomerizer_args=None, analyse_args=None):
    """Run ``src/demonomerizer.py`` and then ``src/analyse.py`` as child processes.

    Both scripts are launched with the current interpreter (``sys.executable``)
    and are addressed by paths relative to the current working directory.

    Args:
        sequence_file: Value passed to demonomerizer.py as ``--sequence_file``.
        output_dir: Directory created if missing; passed to demonomerizer.py as
            ``--output_dir`` and to analyse.py as ``--input_dir``.
        demonomerized_file: File name passed as ``--demonomerized_file``; joined
            with *output_dir* to form the ``--mols_file`` value for analyse.py.
        demonomerizer_args: Optional list of extra command-line tokens for
            demonomerizer.py. ``--NNAA_file dictionary.txt`` and
            ``--batch_size 8`` are added only for flags missing from this list.
        analyse_args: Optional list of extra command-line tokens for analyse.py.
            ``--target_type peptides`` is added only when the flag is missing.

    Raises:
        subprocess.CalledProcessError: If either script exits with a non-zero status.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Run demonomerizer.py
    print(f"Running demonomerizer.py... Input: {sequence_file}")
    demonomerizer_command = [
        sys.executable, "src/demonomerizer.py",
        "--sequence_file", sequence_file,
        "--output_dir", output_dir,
        "--demonomerized_file", demonomerized_file,
    ]
    # Previously these two options were hard-coded and demonomerizer_args was
    # ignored; now they only act as defaults.
    demonomerizer_command.extend(
        _with_default_options(
            [("--NNAA_file", "dictionary.txt"), ("--batch_size", "8")],
            demonomerizer_args,
        )
    )

    subprocess.run(demonomerizer_command, check=True)

    demonomerized_path = os.path.join(output_dir, demonomerized_file)

    # Step 2: Run analyse.py
    print("Running analyse.py...")
    analyse_command = [
        sys.executable, "src/analyse.py",
        "--mols_file", demonomerized_path,
        "--input_dir", output_dir,
    ]
    # Avoid emitting --target_type twice when the caller already chose one.
    analyse_command.extend(
        _with_default_options([("--target_type", "peptides")], analyse_args)
    )

    subprocess.run(analyse_command, check=True)
37
+
38
def build_parser():
    """Create the argument parser for the pipeline command line.

    The ``--output_dir`` default embeds a timestamp taken when the parser is built.
    """
    parser = argparse.ArgumentParser(description="Run the demonomerizer pipeline.")
    parser.add_argument("--sequence_file", default="demonomerized.txt", help="Input sequence file")
    parser.add_argument(
        "--output_dir",
        default=f"output/{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}",
        help="Directory to store output",
    )
    parser.add_argument(
        "--demonomerized_file",
        default="sequences_standardized.txt",
        help="Output demonomerized file name",
    )
    parser.add_argument("--batch_size", type=int, default=8, help="Batch size for demonomerizer.py")
    # The legacy single-dash spelling is kept; the conventional double-dash
    # form is accepted as an alias and stores to the same ``fetch_names`` dest.
    parser.add_argument(
        "-fetch_names", "--fetch_names",
        action="store_true",
        help="Fetch names from PubChem in analyse.py",
    )
    parser.add_argument("--target_type", default="ncAAs", help="Target type: ncAAs or peptides")
    return parser


def main(argv=None):
    """Parse command-line arguments and call ``run_pipeline``.

    Args:
        argv: Optional list of argument strings; ``None`` lets argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # Args for demonomerizer
    demonomerizer_args = ["--NNAA_file", "dictionary.txt", "--batch_size", str(args.batch_size)]

    # Args for analyse
    analyse_args = []
    if args.fetch_names:
        # Forwarded with the single-dash spelling, exactly as before.
        analyse_args.append("-fetch_names")
    if args.target_type:
        analyse_args.extend(["--target_type", args.target_type])

    run_pipeline(args.sequence_file, args.output_dir, args.demonomerized_file, demonomerizer_args, analyse_args)


if __name__ == "__main__":
    main()
demo/example.svg ADDED
demo/example_GPepT_generated_sequences.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ SEQUENCE
2
+ X7681VZ81
3
+ X1132RZ0
4
+ X369X2326Z0
5
+ X72AZ4941
6
+ X183PLGPGZ421
7
+ X2954AZ88
8
+ X34X6765X5Z11
9
+ X47WX47LFKKIGAVLKVLZ0
demo/example_smiles.txt ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SMILES
2
+ CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NO)NC(=O)OCc1ccccc1)C(C)C)[C@@H](O)CC(=O)NCCc1ccccc1
3
+ N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1cc(I)c(O)c(I)c1)C(N)=O
4
+ NC(=O)[C@@H]1C[C@H](NC(=O)C(F)(F)F)CN1C(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-]
5
+ CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCCNC(=N)N)NC(=O)OCc1ccccc1)[C@@H](O)CC(=O)NC1CCCCC1
6
+ CC(C)C[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)Cc1ccccc1)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
7
+ C[C@H](NC(=O)[C@@H](CO)NS(=O)(=O)c1ccccc1)C(=O)N[C@H]1CCCN(C(=N)N)C1O
8
+ C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](c2csc(-c3ccccc3)n2)CN1C(=O)[C@@H](NC(=O)OC1CCCC1)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
9
+ CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCC[N+](C)(C)C)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
10
+ CSCC[C@H](NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](C)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)CC(C)C)[C@@H](C)O)C(=O)O
11
+ CSCC[C@H](NC(=O)[C@H](Cc1cnc[nH]1)NC(=O)[C@H](CO)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)Cc1cnc[nH]1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)O
12
+ CC(C)(C)NC(=O)C1(C2CCCCC2)CCN(C(=O)[C@@H](Cc2ccc(F)cc2)NC(=O)[C@@H]2CNC3(CC3)CN2)CC1
13
+ CC(=O)O[C@H]1C(=O)[C@@]2(C)[C@H]([C@H](OC(=O)c3ccccc3)[C@]3(O)CC(OC(=O)[C@H](OC(=O)NCCNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc4ccccc4)NC(=O)[C@@H](C)N)C(NC(=O)c4ccccc4)c4ccccc4)C(C)=C1C3(C)C)[C@]1(OC(C)=O)CO[C@@H]1C[C@@H]2O
14
+ COC(=O)[C@@H](Cc1ccccc1)NC(=O)[C@H](NC(=O)CC(O)CC(CC(C)C)NC(=O)[C@@H](N)Cc1c[nH]cn1)C(C)C
15
+ CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C(C)C)C(O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCC[N+](C)(C)C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
16
+ CC1OC(SCCCCCCNC(=O)[C@H](CCC(N)=O)NC(=O)[C@@H](C)N)C(O)C(O)C1O
17
+ CC1(C)N([O])C(c2ccc(OCC(=O)NCCCNC(=O)[C@H](Cc3ccc(O)cc3)NC(=O)[C@@H](N)CCCNC(=N)N)cc2)=[N+]([O-])C1(C)C
18
+ CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CSCCC[P+](C)(C)C)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O
19
+ CCN(CC)CCC(=O)NC(C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1)C(C)O
20
+ N[C@@H](Cc1cc2ccccc2[nH]1)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)NCCCC(=O)NCC(=O)NCCCCCCOP(=O)(O)Oc1ccccc1Cl
21
+ C[C@@H](N)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)NCCCCC(=O)OCCNc1nc(NCCc2ccccc2)c2cnn(/C=C/c3ccccc3)c2n1
22
+ CCCC[PH](CCCC)(CCCC)Cc1ccc(NC(=O)C2Cc3ccccc3CN2C(=O)[C@@H](N)CCc2ccccc2)cc1
23
+ CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CCCCN)C(=O)N[C@@H](CCCCN)C(=O)O
24
+ Cc1cc(C(=O)N[C@H](C(=O)N[C@@H](Cc2ccc(F)cc2)C(=O)N[C@@H](/C=C/C(=O)OCc2nc3cc(Cl)ccc3[nH]2)CCC(N)=O)C(C)C)no1
25
+ CCN(CC)c1ccc2c(-c3ccc(S(=O)(=O)NCCCC[C@H](NC(=O)Cc4csc(=N)n4C)C(=O)N[C@@H](Cc4cn(Cc5ccccc5)c[n+]4C)C(=O)NC4CCN(C)CC4)cc3S(=O)(=O)[O-])c3ccc(=[N+](CC)CC)cc-3oc2c1
26
+ CC(C)C[C@H](N)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
27
+ CC1=CC(C)=[N+]2C1=Cc1ccc(CCC(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O)n1[B-]2(F)F
28
+ N=C(N)NCCC[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](CC(=O)O)NC(=O)[C@@H](CO)NC(=O)[C@@H](N)CC(=O)O)C(=O)O
29
+ CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@H]2CCCN2C[C@H]1C(=O)N[C@@H]1CCOc2ccccc21)C1CCC(F)(F)CC1
30
+ COc1ccc(NC(=O)[C@@H]2Cc3ccc(OCC(=O)NO)cc3CN2C(=O)[C@H](C)N)cc1
31
+ COc1cccc(COc2ccc([C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCNC(=N)N)NC(=O)c3cccs3)C(N)=O)cc2)c1
32
+ CC(C)CC(N)C(=O)NCC(=O)NC[C@H](C)B1OC2CC3CC(C3(C)C)[C@@]2(C)O1
33
+ CCN(CC)CCNC(=O)c1ccc(C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C=O)CC(C)C)c(NNN2CCCC2)c1
34
+ CC(O)C(NC(=O)CCN)C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1
35
+ CCCN(CC(=O)N[C@H](C=O)CCCN=C(N)N)C(=O)[C@H]1CCCCN1
36
+ CC(C)C[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(C)C)C(C)C)C(=O)N[C@@H](CO)C(=O)O
37
+ CC(C)(N)C(=O)N[C@H](CCCc1ccccc1)C(=O)N1CCC2(CC1)CC(O)c1ccccc12
38
+ CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](N)CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CSc1ccc2n1[B-](F)(F)[N+]1=CC=CC1=C2)C(=O)O
39
+ O=C(NCCCCC[C@H](NC(=O)[C@@H]1C[C@@H](N2CCCCC2)CN1C(=O)[C@@H](CC1CCCCC1)NC(=O)c1ccc2ccccc2c1)B(O)O)NC1CCCCC1
40
+ CC(=O)N[C@@H](CCCO/N=C/c1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCC(=O)O)C(N)=O
41
+ COc1ccc(CC(C)(NC(=O)[C@@H]2CCCN2C(=O)CCCc2ccc(O)cc2)C(=O)NCCCN)cc1OC
42
+ N=C(N)N1CCC(C(NS(=O)(=O)Cc2ccccc2)C(=O)NCC(=O)N[C@H]2CCCN(C(=N)N)C2O)CC1
43
+ CCC(=O)NCCOCCOCCNC(=O)/N=C(\N)NCCC[C@H](NC(=O)[C@@H](N)CC(=O)O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1cnc[nH]1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC)C(C)C
44
+ N=C(N)c1ccc(CNC(=O)[C@@H]2CCCN2C(=O)[C@H](N)C2CCCCC2)cc1
45
+ CC(C)C[C@H](NC(=O)CNC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)NCC(N)=O
46
+ CCCCCCCC(=O)OC[C@H](NC(=O)C(C)(C)N)C(=O)N1CCC2(CC1)CN(S(C)(=O)=O)c1ccccc12
47
+ COc1ccc(NC(C)=O)cc1C(=O)NNC(=O)[C@H](CCCCN)NC(=O)CCOC[C@H]1OC(OCCCNC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)OC(C)(C)C)C(C)C)[C@H](O)[C@@H](O)[C@@H]1O
48
+ CSCC[C@H](NC(=O)[C@@H]1Cc2ccccc2CN1)C(=O)NO
49
+ CSCC[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(C)=O)C(=O)NCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](COS(=O)(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
50
+ C=Cc1c(C)c2cc3nc(c4c5[nH]c(cc6nc(cc1[nH]2)C(C)=C6CC)c(C)c5C(=O)C4)[C@@H](CCC(=O)N[C@H](C(=O)N[C@@H](CO)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCCCNOC(=O)CCCN(C)c1ccc(/N=N/c2cccc4nc5ccc(N(CC)CC)cc5[n+](-c5ccccc5)c24)cc1)C(N)=O)[C@@H](C)O)[C@@H]3C
51
+ CC(=O)NC(Cc1ccc([N+](=O)[O-])cc1)C(=O)NCC(N)C(=O)c1ccccc1
52
+ CC(C)(C)[C@H](NC(=O)Cc1cc(Cl)cc(Cl)c1)C(=O)NCC(=O)NC/C=C/S(C)(=O)=O
53
+ CC(C)CCOc1ccc2ccccc2c1-c1c(OCC(=O)N[C@H](CCCCN)C(=O)N[C@H](CCCN)C(=O)N[C@@H](CC(C)C)C(=O)OCc2ccccc2)ccc2ccccc12
54
+ CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2ccc(OCCCCCN)cc2c1CCCCCCN)C(=O)N[C@@H](CC(C)C)C(=O)O
55
+ CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C)C(C)C
56
+ CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
57
+ CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
58
+ CC[C@H](C)[C@H](NC(=O)[C@@H](N)C(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](C)C(=O)NCC(=O)O
59
+ CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
60
+ CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
61
+ CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
62
+ CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
63
+ COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)(C(=O)CO)C[C@@H]3O[C@H]1C[C@H]2[C@H](OCN2C(=O)OCc2ccc(NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc3ccccc3)NC(=O)[C@@H](C)N)cc2)[C@H](C)O1
64
+ CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)CC(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
65
+ CC(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O)C(C)C)C(C)C
66
+ CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2c(OCCCCCN=C(N)N)cccc2c1CCCCCCN=C(N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
67
+ CCN(CC(=O)NCC(=O)Nc1cccc(C)c1C)Cc1ccccc1
68
+ CCn1c(-c2nonc2N)nc2c(C#CC(C)(C)O)ncc(OCCCNCCCC(=O)NCCOCCOCCOCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)NCc3ccc(-c4scnc4C)cc3)C(C)(C)C)c21
69
+ Cc1cc2c(s1)-n1c(C)nnc1[C@H](CC(=O)NCCOCCCOCC(=O)N[C@@H](C(=O)N1C[C@H](O)C[C@H]1C(=O)NCc1ccc(-c3scnc3C)cc1)C(C)(C)C)N=C2c1ccc(Cl)cc1
70
+ NC(=O)[C@@H]1C[C@H](NC(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-])CN1
71
+ CC(C)C[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)C[C@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1ccccc1)C(N)=O
72
+ CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCNC(=N)N)NC(=O)CC(c1ccccc1)c1ccccc1)[C@@H](C)O)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1cn(CCN2CCCc3cc(/C=C/C4=C(Br)C(/C=C/c5cc6c7c(c5)CCCN7CCC6)=[O+][B-](F)(C(F)(F)F)O4)ccc32)nn1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O)[C@@H](C)CC)[C@@H](C)CC
73
+ Cc1ncsc1-c1ccc(C2(NC(=O)[C@@H]3C[C@@H](O)CN3C(=O)[C@@H](NC(=O)[C@H]3CC4(C3)C[C@H](N3CCC(c5cnc(N6C7CCC6CN(c6cc(-c8ccccc8O)nnc6N)C7)nc5)CC3)C4)C(C)(C)C)CC2)cc1
74
+ Cc1cc(C)c(CNC(=O)c2cc(-c3ccc(N4CCN(C(=O)CCCCCn5cc(CCCCCC(=O)N[C@@H](C(=O)N6C[C@H](O)C[C@H]6C(=O)NCc6ccc(-c7scnc7C)cc6)C(C)(C)C)nn5)CC4)nc3)cc3c2cnn3C(C)C)c(=O)[nH]1
75
+ CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(C)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC
76
+ CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@H](CNC(=O)c1cccc(S(=O)(=O)F)c1)NC(C)=O)[C@@H](C)CC)C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
77
+ CC(C)[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](Cc1cccc(Cl)c1)NC(=O)C1CCN(C)CC1)C(=O)C(F)(F)F
78
+ COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2ccc(CNC(=O)OC(C)(C)C)c(Cl)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
79
+ COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2cccc(OCC(=O)OC(C)(C)C)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
80
+ COc1ccc([C@H](NC(=O)[C@H](C)NC(=O)C(c2ccc(Cl)cc2)C(C)C)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
81
+ CC(C)C[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCCCN)NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CCCN1C(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](Cc1c[nH]cn1)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)CC(C)C)C(=O)O
82
+ CSCC[C@H](NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCCCN)C(C)C)[C@@H](C)O)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CS)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](CC(C)C)C(=O)O
83
+ CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](N)CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
84
+ Cc1oc2c(c(C)cc3oc(=O)c(CC(=O)NCC(=O)NCC(=O)NCC(C)O)c(C)c32)c1C
85
+ CC[C@H](C)[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(N)=O)NC(C)=O)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O
86
+ Cc1cc(Cn2c(N3CC4(CNC4)C3)nc3c(N4CCN(CCCC(=O)NCCCC(=O)N[C@H](C(=O)N5C[C@H](O)C[C@H]5C(=O)N[C@@H](C)c5ccc(-c6scnc6C)cc5)C(C)(C)C)CC4)cc(Cl)cc32)cc(C)c1F
87
+ Cn1ccc(-c2cc(Cl)c(Cl)c3[nH]c4c(c23)CN(C(=O)CNC(=O)CN2CCNCC2)CC4)n1
88
+ CN(C)CCCCCNC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)Cc1c[nH]cn1
89
+ CN(Cc1ccc2ccccc2c1)C(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]1CCCN1/C(S)=N/Cc1ccccc1Cl
90
+ CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)CNC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CCCN=C(N)N)C(=O)O
91
+ CSCC[C@H](NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CO)NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@H](CCCN=C(N)N)C(=O)O
92
+ O=C(N[C@H](CC1CCCCC1)C(=O)N1C[C@H](N2CCCCC2)C[C@H]1C(=O)N[C@@H](CCCCN1CC2(CSC2)C1)B(O)O)c1ccc2ccccc2c1
93
+ N=C(N)NCCCC[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](Cc1ccccc1)NS(=O)(=O)Cc1ccccc1)C(=O)Cc1ccccc1
94
+ COC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CCCCC(=O)NC[C@@H]1CCN2CC[C@@H](CO[Si](c3ccccc3)(c3ccccc3)C(C)(C)C)N=C2N1)[C@@H](C)O
95
+ C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
96
+ COc1cc(N2CCN(CCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)N[C@@H](C)c3ccc(-c4scnc4C)cc3)C(C)(C)C)CC2)ccc1Nc1ncc(Cl)c(Nc2ccccc2P(C)(C)=O)n1
97
+ C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NNS(=O)(=O)c1ccccc1
98
+ CC[C@H](C)[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CO)NC(=O)CCNC(=S)Nc1ccc(-c2c3ccc(=O)cc-3oc3cc(O)ccc23)c(C(=O)O)c1)C(=O)N[C@@H](CSCc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N1CCC[C@H]1C(N)=O)[C@@H](C)O)C(C)C
99
+ CSCC[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](C)NC(=O)[C@H](CO)NC(=O)[C@H](C)N)[C@@H](C)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CS)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CS)C(=O)NCC(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)O)C(C)C)C(C)C
100
+ CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NP(=O)(O)CCCCN1C(=O)c2ccccc2C1=O)C(=O)NCc1ccccc1
101
+ CC(C)C[C@H](NCC(N)=O)c1cc(F)ccc1N1CCN(C(=O)[C@@H](Cc2ccc(Cl)cc2Cl)N2CCCC2=O)CC1
demo/example_smiles_IDs.txt ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ID SMILES
2
+ CHEMBL3782097 CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NO)NC(=O)OCc1ccccc1)C(C)C)[C@@H](O)CC(=O)NCCc1ccccc1
3
+ CHEMBL3819704 N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1cc(I)c(O)c(I)c1)C(N)=O
4
+ CHEMBL2368819 NC(=O)[C@@H]1C[C@H](NC(=O)C(F)(F)F)CN1C(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-]
5
+ CHEMBL3545807 CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCCNC(=N)N)NC(=O)OCc1ccccc1)[C@@H](O)CC(=O)NC1CCCCC1
6
+ CHEMBL3302347 CC(C)C[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)Cc1ccccc1)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
7
+ CHEMBL1184757 C[C@H](NC(=O)[C@@H](CO)NS(=O)(=O)c1ccccc1)C(=O)N[C@H]1CCCN(C(=N)N)C1O
8
+ CHEMBL2403897 C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](c2csc(-c3ccccc3)n2)CN1C(=O)[C@@H](NC(=O)OC1CCCC1)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
9
+ CHEMBL1229044 CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCC[N+](C)(C)C)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
10
+ CHEMBL2425403 CSCC[C@H](NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](C)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)CC(C)C)[C@@H](C)O)C(=O)O
11
+ CHEMBL2425396 CSCC[C@H](NC(=O)[C@H](Cc1cnc[nH]1)NC(=O)[C@H](CO)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)Cc1cnc[nH]1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)O
12
+ CHEMBL1181891 CC(C)(C)NC(=O)C1(C2CCCCC2)CCN(C(=O)[C@@H](Cc2ccc(F)cc2)NC(=O)[C@@H]2CNC3(CC3)CN2)CC1
13
+ CHEMBL1185696 CC(=O)O[C@H]1C(=O)[C@@]2(C)[C@H]([C@H](OC(=O)c3ccccc3)[C@]3(O)CC(OC(=O)[C@H](OC(=O)NCCNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc4ccccc4)NC(=O)[C@@H](C)N)C(NC(=O)c4ccccc4)c4ccccc4)C(C)=C1C3(C)C)[C@]1(OC(C)=O)CO[C@@H]1C[C@@H]2O
14
+ CHEMBL1189783 COC(=O)[C@@H](Cc1ccccc1)NC(=O)[C@H](NC(=O)CC(O)CC(CC(C)C)NC(=O)[C@@H](N)Cc1c[nH]cn1)C(C)C
15
+ CHEMBL1229047 CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C(C)C)C(O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCC[N+](C)(C)C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
16
+ CHEMBL418285 CC1OC(SCCCCCCNC(=O)[C@H](CCC(N)=O)NC(=O)[C@@H](C)N)C(O)C(O)C1O
17
+ CHEMBL3787168 CC1(C)N([O])C(c2ccc(OCC(=O)NCCCNC(=O)[C@H](Cc3ccc(O)cc3)NC(=O)[C@@H](N)CCCNC(=N)N)cc2)=[N+]([O-])C1(C)C
18
+ CHEMBL4302812 CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CSCCC[P+](C)(C)C)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O
19
+ CHEMBL1195733 CCN(CC)CCC(=O)NC(C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1)C(C)O
20
+ CHEMBL1179530 N[C@@H](Cc1cc2ccccc2[nH]1)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)NCCCC(=O)NCC(=O)NCCCCCCOP(=O)(O)Oc1ccccc1Cl
21
+ CHEMBL5029048 C[C@@H](N)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)NCCCCC(=O)OCCNc1nc(NCCc2ccccc2)c2cnn(/C=C/c3ccccc3)c2n1
22
+ CHEMBL1199463 CCCC[PH](CCCC)(CCCC)Cc1ccc(NC(=O)C2Cc3ccccc3CN2C(=O)[C@@H](N)CCc2ccccc2)cc1
23
+ CHEMBL2103901 CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CCCCN)C(=O)N[C@@H](CCCCN)C(=O)O
24
+ CHEMBL2165218 Cc1cc(C(=O)N[C@H](C(=O)N[C@@H](Cc2ccc(F)cc2)C(=O)N[C@@H](/C=C/C(=O)OCc2nc3cc(Cl)ccc3[nH]2)CCC(N)=O)C(C)C)no1
25
+ CHEMBL4300381 CCN(CC)c1ccc2c(-c3ccc(S(=O)(=O)NCCCC[C@H](NC(=O)Cc4csc(=N)n4C)C(=O)N[C@@H](Cc4cn(Cc5ccccc5)c[n+]4C)C(=O)NC4CCN(C)CC4)cc3S(=O)(=O)[O-])c3ccc(=[N+](CC)CC)cc-3oc2c1
26
+ CHEMBL3302723 CC(C)C[C@H](N)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
27
+ CHEMBL4301870 CC1=CC(C)=[N+]2C1=Cc1ccc(CCC(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O)n1[B-]2(F)F
28
+ CHEMBL2304033 N=C(N)NCCC[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](CC(=O)O)NC(=O)[C@@H](CO)NC(=O)[C@@H](N)CC(=O)O)C(=O)O
29
+ CHEMBL2364835 CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@H]2CCCN2C[C@H]1C(=O)N[C@@H]1CCOc2ccccc21)C1CCC(F)(F)CC1
30
+ CHEMBL1852804 COc1ccc(NC(=O)[C@@H]2Cc3ccc(OCC(=O)NO)cc3CN2C(=O)[C@H](C)N)cc1
31
+ CHEMBL3740745 COc1cccc(COc2ccc([C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCNC(=N)N)NC(=O)c3cccs3)C(N)=O)cc2)c1
32
+ CHEMBL5315308 CC(C)CC(N)C(=O)NCC(=O)NC[C@H](C)B1OC2CC3CC(C3(C)C)[C@@]2(C)O1
33
+ CHEMBL1188598 CCN(CC)CCNC(=O)c1ccc(C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C=O)CC(C)C)c(NNN2CCCC2)c1
34
+ CHEMBL1189941 CC(O)C(NC(=O)CCN)C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1
35
+ CHEMBL1191337 CCCN(CC(=O)N[C@H](C=O)CCCN=C(N)N)C(=O)[C@H]1CCCCN1
36
+ CHEMBL3407793 CC(C)C[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(C)C)C(C)C)C(=O)N[C@@H](CO)C(=O)O
37
+ CHEMBL1193469 CC(C)(N)C(=O)N[C@H](CCCc1ccccc1)C(=O)N1CCC2(CC1)CC(O)c1ccccc12
38
+ CHEMBL3408302 CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](N)CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CSc1ccc2n1[B-](F)(F)[N+]1=CC=CC1=C2)C(=O)O
39
+ CHEMBL4597997 O=C(NCCCCC[C@H](NC(=O)[C@@H]1C[C@@H](N2CCCCC2)CN1C(=O)[C@@H](CC1CCCCC1)NC(=O)c1ccc2ccccc2c1)B(O)O)NC1CCCCC1
40
+ CHEMBL3410386 CC(=O)N[C@@H](CCCO/N=C/c1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCC(=O)O)C(N)=O
41
+ CHEMBL1196235 COc1ccc(CC(C)(NC(=O)[C@@H]2CCCN2C(=O)CCCc2ccc(O)cc2)C(=O)NCCCN)cc1OC
42
+ CHEMBL1181305 N=C(N)N1CCC(C(NS(=O)(=O)Cc2ccccc2)C(=O)NCC(=O)N[C@H]2CCCN(C(=N)N)C2O)CC1
43
+ CHEMBL3787701 CCC(=O)NCCOCCOCCNC(=O)/N=C(\N)NCCC[C@H](NC(=O)[C@@H](N)CC(=O)O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1cnc[nH]1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC)C(C)C
44
+ CHEMBL1198214 N=C(N)c1ccc(CNC(=O)[C@@H]2CCCN2C(=O)[C@H](N)C2CCCCC2)cc1
45
+ CHEMBL3304520 CC(C)C[C@H](NC(=O)CNC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)NCC(N)=O
46
+ CHEMBL1179088 CCCCCCCC(=O)OC[C@H](NC(=O)C(C)(C)N)C(=O)N1CCC2(CC1)CN(S(C)(=O)=O)c1ccccc12
47
+ CHEMBL3794663 COc1ccc(NC(C)=O)cc1C(=O)NNC(=O)[C@H](CCCCN)NC(=O)CCOC[C@H]1OC(OCCCNC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)OC(C)(C)C)C(C)C)[C@H](O)[C@@H](O)[C@@H]1O
48
+ CHEMBL1852000 CSCC[C@H](NC(=O)[C@@H]1Cc2ccccc2CN1)C(=O)NO
49
+ CHEMBL1207289 CSCC[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(C)=O)C(=O)NCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](COS(=O)(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
50
+ CHEMBL525036 C=Cc1c(C)c2cc3nc(c4c5[nH]c(cc6nc(cc1[nH]2)C(C)=C6CC)c(C)c5C(=O)C4)[C@@H](CCC(=O)N[C@H](C(=O)N[C@@H](CO)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCCCNOC(=O)CCCN(C)c1ccc(/N=N/c2cccc4nc5ccc(N(CC)CC)cc5[n+](-c5ccccc5)c24)cc1)C(N)=O)[C@@H](C)O)[C@@H]3C
51
+ CHEMBL2361923 CC(=O)NC(Cc1ccc([N+](=O)[O-])cc1)C(=O)NCC(N)C(=O)c1ccccc1
52
+ CHEMBL3354497 CC(C)(C)[C@H](NC(=O)Cc1cc(Cl)cc(Cl)c1)C(=O)NCC(=O)NC/C=C/S(C)(=O)=O
53
+ CHEMBL1199003 CC(C)CCOc1ccc2ccccc2c1-c1c(OCC(=O)N[C@H](CCCCN)C(=O)N[C@H](CCCN)C(=O)N[C@@H](CC(C)C)C(=O)OCc2ccccc2)ccc2ccccc12
54
+ CHEMBL1183069 CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2ccc(OCCCCCN)cc2c1CCCCCCN)C(=O)N[C@@H](CC(C)C)C(=O)O
55
+ CHEMBL3946803 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C)C(C)C
56
+ CHEMBL3985737 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
57
+ CHEMBL3984334 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
58
+ CHEMBL284201 CC[C@H](C)[C@H](NC(=O)[C@@H](N)C(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](C)C(=O)NCC(=O)O
59
+ CHEMBL3890815 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
60
+ CHEMBL3944455 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
61
+ CHEMBL3890020 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
62
+ CHEMBL3891294 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
63
+ CHEMBL2219891 COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)(C(=O)CO)C[C@@H]3O[C@H]1C[C@H]2[C@H](OCN2C(=O)OCc2ccc(NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc3ccccc3)NC(=O)[C@@H](C)N)cc2)[C@H](C)O1
64
+ CHEMBL3983321 CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)CC(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
65
+ CHEMBL3914919 CC(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O)C(C)C)C(C)C
66
+ CHEMBL1178333 CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2c(OCCCCCN=C(N)N)cccc2c1CCCCCCN=C(N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
67
+ CHEMBL1463226 CCN(CC(=O)NCC(=O)Nc1cccc(C)c1C)Cc1ccccc1
68
+ CHEMBL5085501 CCn1c(-c2nonc2N)nc2c(C#CC(C)(C)O)ncc(OCCCNCCCC(=O)NCCOCCOCCOCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)NCc3ccc(-c4scnc4C)cc3)C(C)(C)C)c21
69
+ CHEMBL5286315 Cc1cc2c(s1)-n1c(C)nnc1[C@H](CC(=O)NCCOCCCOCC(=O)N[C@@H](C(=O)N1C[C@H](O)C[C@H]1C(=O)NCc1ccc(-c3scnc3C)cc1)C(C)(C)C)N=C2c1ccc(Cl)cc1
70
+ CHEMBL2368817 NC(=O)[C@@H]1C[C@H](NC(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-])CN1
71
+ CHEMBL5071325 CC(C)C[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)C[C@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1ccccc1)C(N)=O
72
+ CHEMBL5285634 CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCNC(=N)N)NC(=O)CC(c1ccccc1)c1ccccc1)[C@@H](C)O)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1cn(CCN2CCCc3cc(/C=C/C4=C(Br)C(/C=C/c5cc6c7c(c5)CCCN7CCC6)=[O+][B-](F)(C(F)(F)F)O4)ccc32)nn1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O)[C@@H](C)CC)[C@@H](C)CC
73
+ CHEMBL5185804 Cc1ncsc1-c1ccc(C2(NC(=O)[C@@H]3C[C@@H](O)CN3C(=O)[C@@H](NC(=O)[C@H]3CC4(C3)C[C@H](N3CCC(c5cnc(N6C7CCC6CN(c6cc(-c8ccccc8O)nnc6N)C7)nc5)CC3)C4)C(C)(C)C)CC2)cc1
74
+ CHEMBL5202298 Cc1cc(C)c(CNC(=O)c2cc(-c3ccc(N4CCN(C(=O)CCCCCn5cc(CCCCCC(=O)N[C@@H](C(=O)N6C[C@H](O)C[C@H]6C(=O)NCc6ccc(-c7scnc7C)cc6)C(C)(C)C)nn5)CC4)nc3)cc3c2cnn3C(C)C)c(=O)[nH]1
75
+ CHEMBL5090130 CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(C)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC
76
+ CHEMBL5091695 CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@H](CNC(=O)c1cccc(S(=O)(=O)F)c1)NC(C)=O)[C@@H](C)CC)C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
77
+ CHEMBL5090609 CC(C)[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](Cc1cccc(Cl)c1)NC(=O)C1CCN(C)CC1)C(=O)C(F)(F)F
78
+ CHEMBL5073350 COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2ccc(CNC(=O)OC(C)(C)C)c(Cl)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
79
+ CHEMBL5089534 COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2cccc(OCC(=O)OC(C)(C)C)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
80
+ CHEMBL5088663 COc1ccc([C@H](NC(=O)[C@H](C)NC(=O)C(c2ccc(Cl)cc2)C(C)C)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
81
+ CHEMBL5195766 CC(C)C[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCCCN)NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CCCN1C(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](Cc1c[nH]cn1)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)CC(C)C)C(=O)O
82
+ CHEMBL5077064 CSCC[C@H](NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCCCN)C(C)C)[C@@H](C)O)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CS)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](CC(C)C)C(=O)O
83
+ CHEMBL1766929 CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](N)CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
84
+ CHEMBL1564198 Cc1oc2c(c(C)cc3oc(=O)c(CC(=O)NCC(=O)NCC(=O)NCC(C)O)c(C)c32)c1C
85
+ CHEMBL5198087 CC[C@H](C)[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(N)=O)NC(C)=O)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O
86
+ CHEMBL5208458 Cc1cc(Cn2c(N3CC4(CNC4)C3)nc3c(N4CCN(CCCC(=O)NCCCC(=O)N[C@H](C(=O)N5C[C@H](O)C[C@H]5C(=O)N[C@@H](C)c5ccc(-c6scnc6C)cc5)C(C)(C)C)CC4)cc(Cl)cc32)cc(C)c1F
87
+ CHEMBL5075875 Cn1ccc(-c2cc(Cl)c(Cl)c3[nH]c4c(c23)CN(C(=O)CNC(=O)CN2CCNCC2)CC4)n1
88
+ CHEMBL1767019 CN(C)CCCCCNC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)Cc1c[nH]cn1
89
+ CHEMBL323044 CN(Cc1ccc2ccccc2c1)C(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]1CCCN1/C(S)=N/Cc1ccccc1Cl
90
+ CHEMBL63188 CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)CNC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CCCN=C(N)N)C(=O)O
91
+ CHEMBL3138731 CSCC[C@H](NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CO)NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@H](CCCN=C(N)N)C(=O)O
92
+ CHEMBL4596927 O=C(N[C@H](CC1CCCCC1)C(=O)N1C[C@H](N2CCCCC2)C[C@H]1C(=O)N[C@@H](CCCCN1CC2(CSC2)C1)B(O)O)c1ccc2ccccc2c1
93
+ CHEMBL1797525 N=C(N)NCCCC[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](Cc1ccccc1)NS(=O)(=O)Cc1ccccc1)C(=O)Cc1ccccc1
94
+ CHEMBL2068547 COC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CCCCC(=O)NC[C@@H]1CCN2CC[C@@H](CO[Si](c3ccccc3)(c3ccccc3)C(C)(C)C)N=C2N1)[C@@H](C)O
95
+ CHEMBL414993 C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
96
+ CHEMBL5078877 COc1cc(N2CCN(CCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)N[C@@H](C)c3ccc(-c4scnc4C)cc3)C(C)(C)C)CC2)ccc1Nc1ncc(Cl)c(Nc2ccccc2P(C)(C)=O)n1
97
+ CHEMBL414992 C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NNS(=O)(=O)c1ccccc1
98
+ CHEMBL5281856 CC[C@H](C)[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CO)NC(=O)CCNC(=S)Nc1ccc(-c2c3ccc(=O)cc-3oc3cc(O)ccc23)c(C(=O)O)c1)C(=O)N[C@@H](CSCc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N1CCC[C@H]1C(N)=O)[C@@H](C)O)C(C)C
99
+ CHEMBL5094988 CSCC[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](C)NC(=O)[C@H](CO)NC(=O)[C@H](C)N)[C@@H](C)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CS)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CS)C(=O)NCC(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)O)C(C)C)C(C)C
100
+ CHEMBL419395 CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NP(=O)(O)CCCCN1C(=O)c2ccccc2C1=O)C(=O)NCc1ccccc1
101
+ CHEMBL393789 CC(C)C[C@H](NCC(N)=O)c1cc(F)ccc1N1CCN(C(=O)[C@@H](Cc2ccc(Cl)cc2Cl)N2CCCC2=O)CC1
dictionary.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pandas==2.2.2
2
+ rdkit-pypi==2022.9.5
3
+ tqdm==4.67.1
4
+ argparse==1.4.0
5
+ matplotlib==3.8.0
6
+ gradio>=5.0.0
7
+ cairosvg
run_pipeline.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import subprocess
4
+ import sys
5
+ import datetime
6
+
7
def run_pipeline(input_file, output_dir, monomerizer_args=None):
    """Run monomerizer, standardizer and GPepT data preparation in sequence.

    Every stage is started as a child process of the current interpreter.
    Because ``check=True`` is used, a stage that exits non-zero raises
    ``subprocess.CalledProcessError`` and the later stages are not run.

    Args:
        input_file: Path forwarded to ``src/monomerizer.py`` as ``--input_file``.
        output_dir: Directory forwarded to every stage as ``--output_dir``;
            it is created when it does not exist yet.
        monomerizer_args: Optional extra argv entries appended to the
            monomerizer command only.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Stage 1: monomerizer, the only stage that takes the input file and extras.
    print(f"Running monomerizer.py... Input: {input_file}, Output: {output_dir}")
    first_stage = [sys.executable, "src/monomerizer.py", "--input_file", input_file, "--output_dir", output_dir]
    if monomerizer_args:
        first_stage += monomerizer_args
    subprocess.run(first_stage, check=True)

    # Stages 2 and 3 share the same command shape and only need the output directory.
    for script_name in ("standardizer.py", "prepare_GPepT_data.py"):
        print(f"Running {script_name}...")
        subprocess.run([sys.executable, f"src/{script_name}", "--output_dir", output_dir], check=True)
27
+
28
+
29
def build_monomerizer_args(args):
    """Turn parsed CLI options into the extra argv entries for monomerizer.py.

    Every returned element is a ``str``: ``subprocess.run`` rejects argv
    entries of any other type, so numeric options are stringified here.
    Options that are unset (``None``/``False``) or falsy are not forwarded,
    which leaves monomerizer.py to apply its own defaults.

    Args:
        args: Namespace with ``process_cyclic``, ``min_amino_acids``,
            ``batch_size``, ``max_workers`` and ``draw`` attributes.

    Returns:
        List of strings to append to the monomerizer command line.
    """
    forwarded = []
    if args.process_cyclic:
        forwarded.append("-process_cyclic")
    valued_options = (
        ("--min_amino_acids", args.min_amino_acids),
        ("--batch_size", args.batch_size),
        ("--max_workers", args.max_workers),
    )
    for flag, value in valued_options:
        if value:
            forwarded.extend([flag, str(value)])
    if args.draw:
        forwarded.append("-draw")
    return forwarded


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run a pipeline of programs sequentially.")

    # Pipeline-level options; everything except the first two is passed through to monomerizer.py.
    parser.add_argument("--input_file", default="demo/example_smiles.txt", help="Input file for the pipeline")
    parser.add_argument("--output_dir", default=f"output/{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}", help="Output directory")
    parser.add_argument("--process_cyclic", action="store_true", help="Process cyclic compounds")
    parser.add_argument("--min_amino_acids", type=int, help="Minimum number of amino acids required")
    parser.add_argument("--batch_size", type=int, help="Batch size for processing")
    parser.add_argument("--max_workers", type=int, help="Maximum number of workers for parallel processing")
    parser.add_argument("-draw", action="store_true", help="Draw the molecules")

    args = parser.parse_args()

    # Run the pipeline
    run_pipeline(args.input_file, args.output_dir, monomerizer_args=build_monomerizer_args(args))
src/analyse.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import pandas as pd
4
+ from rdkit import Chem
5
+ from rdkit.Chem import rdMolDescriptors, DataStructs, Descriptors
6
+ import os, sys, requests, tqdm, re, argparse
7
+ from collections import defaultdict
8
+ import xml.etree.ElementTree as ET
9
+
10
def add_canonical_smiles(df):
    """Return ``df`` with one extra row appended per canonical amino acid.

    Each appended row holds the one-letter code in ``ID``, the free amino
    acid SMILES in ``SMILES``, the string ``'True'`` in ``CANONICAL``,
    ``'NotTer'`` in ``TERMINAL`` and the parsed RDKit molecule in ``ROMol``.
    The input frame is left untouched; the result gets a fresh integer index.

    Args:
        df: DataFrame to extend.

    Returns:
        A new DataFrame containing the rows of ``df`` followed by the 20
        canonical amino acid rows.
    """
    # Code and structure live on the same line so a label can never be paired
    # with another residue's SMILES (N/Q and V/I are easy to transpose).
    canonical_amino_acids = [
        ('W', "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N"),  # Tryptophan
        ('R', "C(C[C@@H](C(=O)O)N)CNC(=N)N"),            # Arginine
        ('H', "C1=C(NC=N1)C[C@@H](C(=O)O)N"),            # Histidine
        ('P', "C1C[C@H](NC1)C(=O)O"),                    # Proline
        ('K', "C(CCN)C[C@@H](C(=O)O)N"),                 # Lysine
        ('M', "CSCC[C@@H](C(=O)O)N"),                    # Methionine
        ('N', "C([C@@H](C(=O)O)N)C(=O)N"),               # Asparagine
        ('Q', "C(CC(=O)N)[C@@H](C(=O)O)N"),              # Glutamine
        ('E', "C(CC(=O)O)[C@@H](C(=O)O)N"),              # Glutamic acid
        ('D', "OC(=O)C[C@@H](C(=O)O)N"),                 # Aspartic acid
        ('Y', "C1=CC(=CC=C1C[C@@H](C(=O)O)N)O"),         # Tyrosine
        ('F', "C1=CC=C(C=C1)C[C@@H](C(=O)O)N"),          # Phenylalanine
        ('V', "CC(C)[C@@H](C(=O)O)N"),                   # Valine
        ('L', "CC(C)C[C@@H](C(=O)O)N"),                  # Leucine
        ('I', "CC[C@H](C)[C@@H](C(=O)O)N"),              # Isoleucine
        ('T', "C[C@H]([C@@H](C(=O)O)N)O"),               # Threonine
        ('C', "C([C@@H](C(=O)O)N)S"),                    # Cysteine
        ('S', "C([C@@H](C(=O)O)N)O"),                    # Serine
        ('A', "C[C@@H](C(=O)O)N"),                       # Alanine
        ('G', "C(C(=O)O)N"),                             # Glycine
    ]
    one_letter_codes = [code for code, _ in canonical_amino_acids]
    canonical_smiles_list = [smi for _, smi in canonical_amino_acids]

    canonical_df = pd.DataFrame({
        'ID': one_letter_codes,
        'SMILES': canonical_smiles_list,
        'CANONICAL': ['True'] * len(canonical_smiles_list),
        'TERMINAL': ['NotTer'] * len(canonical_smiles_list),
        'ROMol': [Chem.MolFromSmiles(smi) for smi in canonical_smiles_list]
    })

    return pd.concat([df, canonical_df], ignore_index=True)
44
+
45
def cal_tanimoto(mol):
    """Return the Tanimoto similarity between ``mol`` and glycine.

    Both molecules are described with radius-2 Morgan fingerprints before
    the similarity is computed.
    """
    glycine_ref = Chem.MolFromSmiles("C(C(=O)O)N")
    query_fp, glycine_fp = [
        rdMolDescriptors.GetMorganFingerprint(candidate, 2)
        for candidate in (mol, glycine_ref)
    ]
    return DataStructs.TanimotoSimilarity(query_fp, glycine_fp)
50
+
51
def fetch_pubchem_name(smiles, timeout=30):
    """Look up the PubChem ``Title`` for a SMILES string.

    The SMILES is sent as a URL argument instead of being pasted into the
    URL path: SMILES routinely contain ``#``, ``/`` and ``\\``, which would
    otherwise be read as a fragment marker or path separators and corrupt
    the request.

    Args:
        smiles: SMILES string to look up.
        timeout: Seconds to wait for PubChem before giving up.

    Returns:
        The compound title, or the string ``"NULL"`` when the request fails,
        times out, or the response lacks the expected fields.
    """
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/property/Title/JSON"
    try:
        response = requests.get(url, params={"smiles": smiles}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        return data['PropertyTable']['Properties'][0].get('Title', 'NULL')
    except (requests.exceptions.RequestException, KeyError, IndexError, ValueError):
        # ValueError covers a body that is not valid JSON.
        return "NULL"
60
+
61
def fetch_chembl_similarity(smiles, similarity_threshold=100, timeout=30):
    """Return the ChEMBL ids of molecules similar to ``smiles``.

    The SMILES is percent-encoded before it is placed in the URL path so
    that characters such as ``#``, ``/`` and ``\\`` are not interpreted as
    URL syntax.

    Args:
        smiles: Query SMILES string.
        similarity_threshold: Similarity cut-off placed in the request URL.
        timeout: Seconds to wait for ChEMBL before giving up.

    Returns:
        List of ChEMBL id strings, or ``["NULL"]`` when nothing is found,
        the request fails, or the response is not parseable XML.
    """
    from urllib.parse import quote

    url = f"https://www.ebi.ac.uk/chembl/api/data/similarity/{quote(smiles, safe='')}/{similarity_threshold}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        root = ET.fromstring(response.content)
    except (requests.exceptions.RequestException, ET.ParseError):
        return ["NULL"]

    chembl_ids = []
    for molecule in root.findall('.//molecule'):
        id_node = molecule.find('.//molecule_chembl_id')
        # Skip empty id elements: a None entry would break the ",".join() done by callers.
        if id_node is not None and id_node.text:
            chembl_ids.append(id_node.text)
    return chembl_ids if chembl_ids else ["NULL"]
71
+
72
def fetch_names(smiles):
    """Return ``(pubchem_title, comma_joined_chembl_ids)`` for ``smiles``.

    PubChem is queried first, then ChEMBL; the ChEMBL ids are joined with
    commas into a single string.
    """
    title = fetch_pubchem_name(smiles)
    similar_ids = ",".join(fetch_chembl_similarity(smiles))
    return title, similar_ids
76
+
77
def fetch_rdkit_properties(smiles):
    """Compute seven RDKit descriptors for ``smiles``.

    Returns:
        ``[exact mol weight, logP, TPSA, formal charge, rotatable bonds,
        H-bond donors, H-bond acceptors]``. When the SMILES cannot be parsed
        or any descriptor raises, a list of seven ``"NULL"`` strings is
        returned instead.
    """
    placeholder = ["NULL"] * 7
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return placeholder
        # Order matters: callers unpack the list positionally into columns.
        descriptor_fns = (
            Descriptors.ExactMolWt,
            Descriptors.MolLogP,
            Descriptors.TPSA,
            Chem.GetFormalCharge,
            Descriptors.NumRotatableBonds,
            Descriptors.NumHDonors,
            Descriptors.NumHAcceptors,
        )
        return [compute(mol) for compute in descriptor_fns]
    except Exception:
        return placeholder
92
+
93
def count_monomers(mols_df):
    """Count how often each monomer token occurs in the ``SEQUENCE`` column.

    A token is an uppercase letter followed by any run of characters that
    are not uppercase letters. Entries that are not non-empty strings are
    ignored.

    Returns:
        ``defaultdict(int)`` mapping token -> number of occurrences.
    """
    tally = defaultdict(int)
    token_pattern = re.compile('[A-Z][^A-Z]*')
    for entry in mols_df['SEQUENCE']:
        if not isinstance(entry, str) or not entry:
            continue
        for monomer in token_pattern.findall(entry):
            tally[monomer] += 1
    return tally
101
+
102
def main():
    """Command-line entry point: annotate a molecules table and write it as CSV.

    Reads the tab-separated file given by ``--mols_file``, drops rows with a
    missing or duplicated ``SMILES``, adds a Tanimoto-to-glycine column and
    seven RDKit descriptor columns (plus PubChem/ChEMBL names when
    ``-fetch_names`` is set), and writes the result to
    ``<input_dir>/<output_file>``.
    """
    parser = argparse.ArgumentParser(description='Analyse non-natural amino acids (NNAA) from PubChem.')
    # Help texts describe what the code below actually does with each option.
    parser.add_argument('--input_dir', help='Directory the output CSV is written to.', default='data/tmp')
    parser.add_argument('--mols_file', help='Path to the tab-separated molecules file (used as given, not joined with input_dir).', default='standard/sequences_standardized.txt')
    parser.add_argument('-fetch_names', help='Fetch names from PubChem and ChEMBL.', action='store_true')
    # NOTE(review): --target_type is parsed but not read anywhere in this function.
    parser.add_argument('--target_type', help='Type of target: ncAAs or peptides?', default='ncAAs')
    parser.add_argument('--output_file', help='Output CSV file name.', default='analysis.csv')
    args = parser.parse_args()

    mols_path = args.mols_file
    output_path = os.path.join(args.input_dir, args.output_file)

    df = pd.read_csv(mols_path, sep='\t')
    df = df.dropna(subset=['SMILES']).drop_duplicates(subset=['SMILES'])
    df['ROMol'] = df['SMILES'].apply(Chem.MolFromSmiles)

    if args.fetch_names:
        # One network round-trip per molecule to each service.
        df[['PUBCHEM_NAME', 'CHEMBL_NAMES']] = df['SMILES'].apply(fetch_names).tolist()

    df['Tanimoto_to_Glycine'] = df['ROMol'].apply(cal_tanimoto)
    df[['MolWt', 'LogP', 'TPSA', 'FormalCharge', 'RotatableBonds', 'HydrogenDonors', 'HydrogenAcceptors']] = df['SMILES'].apply(fetch_rdkit_properties).tolist()

    df.to_csv(output_path, index=False)
    print(f"Processing completed. Results saved to {output_path}")


if __name__ == "__main__":
    main()
src/demonomerizer.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import re, ast
4
+ from rdkit import Chem
5
+ import pandas as pd
6
+ from tqdm import tqdm
7
+ import argparse
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ import os
10
+
11
# Command-line options (parsed at import time; this file is run as a script).
parser = argparse.ArgumentParser(description="Preprocess the generated sequences file")
parser.add_argument(
    "--sequence_file",
    type=str,
    help="Path to the generated sequences file",
    default="sequences_generated.txt",
)
parser.add_argument(
    "--NNAA_file",
    type=str,
    help="Path to the NNAA file",
    default="dictionary.txt",
)
parser.add_argument(
    "--batch_size",
    type=int,
    help="Batch size for processing sequences",
    default=8,
)
parser.add_argument(
    "--output_dir",
    type=str,
    help="Output directory",
    default="output",
)
parser.add_argument(
    "--demonomerized_file",
    type=str,
    help="Output demonomerized file name",
    default="demonomerized.txt",
)

args = parser.parse_args()

# SMARTS patterns for the amino acid backbone, with and without the
# terminal hydroxyl oxygen, and for a formed peptide bond.
valid_backbone = Chem.MolFromSmarts("[NH,NH2]CC(=O)")
valid_backbone_OH = Chem.MolFromSmarts("[NH,NH2]CC(=O)O")
peptide_bond_mol = Chem.MolFromSmarts("[N,n][C,c]C(=O)[*!O]")  # [*!O] ensures it does not match AAter

# Positions inside a backbone match tuple: N is atom 0, the carbonyl C is
# atom 2 and the hydroxyl O (valid_backbone_OH only) is atom 4.
edge_N, edge_C, edge_O = 0, 2, 4
27
+
28
+ name_smi_dict = {
29
+ # isomeric SMILES from pubchem. eg https://pubchem.ncbi.nlm.nih.gov/compound/Alanine except for Asp (from https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=3309) and Arg (from https://en.wikipedia.org/wiki/Arginine)
30
+ "Wter": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N",
31
+ "W": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O))N",
32
+ "Rter": "C(C[C@@H](C(=O)O)N)CNC(=N)N",
33
+ "R": "C(C[C@@H](C(=O))N)CNC(=N)N",
34
+ "Hter": "C1=C(NC=N1)C[C@@H](C(=O)O)N",
35
+ "H": "C1=C(NC=N1)C[C@@H](C(=O))N",
36
+ "Pter": "C1C[C@H](NC1)C(=O)O",
37
+ "P": "C1C[C@H](NC1)C(=O)",
38
+ "Kter": "C(CCN)C[C@@H](C(=O)O)N",
39
+ "K": "C(CCN)C[C@@H](C(=O))N",
40
+ "Mter": "CSCC[C@@H](C(=O)O)N",
41
+ "M": "CSCC[C@@H](C(=O))N",
42
+ "Qter": "C(CC(=O)N)[C@@H](C(=O)O)N",
43
+ "Q": "C(CC(=O)N)[C@@H](C(=O))N",
44
+ "Nter": "C([C@@H](C(=O)O)N)C(=O)N",
45
+ "N": "C([C@@H](C(=O))N)C(=O)N",
46
+ "Eter": "C(CC(=O)O)[C@@H](C(=O)O)N",
47
+ "E": "C(CC(=O)O)[C@@H](C(=O))N",
48
+ "Dter": "OC(=O)C[C@@H](C(=O)O)N",
49
+ "D": "OC(=O)C[C@@H](C(=O))N",
50
+ "Yter": "C1=CC(=CC=C1C[C@@H](C(=O)O)N)O",
51
+ "Y": "C1=CC(=CC=C1C[C@@H](C(=O))N)O",
52
+ "Fter": "C1=CC=C(C=C1)C[C@@H](C(=O)O)N",
53
+ "F": "C1=CC=C(C=C1)C[C@@H](C(=O))N",
54
+ "Iter": "CC[C@H](C)[C@@H](C(=O)O)N", # TODO add correct hydroxyl oxygen for every AA terminal
55
+ "I": "CC[C@H](C)[C@@H](C(=O))N",
56
+ "Lter": "CC(C)C[C@@H](C(=O)O)N",
57
+ "L": "CC(C)C[C@@H](C(=O))N",
58
+ "Vter": "CC(C)[C@@H](C(=O)O)N",
59
+ "V": "CC(C)[C@@H](C(=O))N",
60
+ "Tter": "C[C@H]([C@@H](C(=O)O)N)O",
61
+ "T": "C[C@H]([C@@H](C(=O))N)O",
62
+ "Cter": "C([C@@H](C(=O)O)N)S",
63
+ "C": "C([C@@H](C(=O))N)S",
64
+ "Ster": "C([C@@H](C(=O)O)N)O",
65
+ "S": "C([C@@H](C(=O))N)O",
66
+ "Ater": "C[C@@H](C(=O)O)N",
67
+ "A": "C[C@@H](C(=O))N",
68
+ "Gter": "C(C(=O)O)N",
69
+ "G": "C(C(=O))N",
70
+ }
71
+
72
def mark_edge(amino, pattern, edge_position):
    """Tag one atom of the first ``pattern`` match in ``amino`` as an edge.

    Args:
        amino: Molecule to search.
        pattern: Substructure query.
        edge_position: Index into the match tuple selecting the atom to tag.

    Returns:
        The tagged atom, whose ``atomNote`` property is now ``"edge"``.

    Raises:
        IndexError: If the pattern does not match (the match tuple is empty).
    """
    atom_idx = amino.GetSubstructMatch(pattern)[edge_position]
    tagged = amino.GetAtomWithIdx(atom_idx)
    tagged.SetProp("atomNote", "edge")
    return tagged
78
+
79
def mark_edge_NNAA(NNAA, bond_sites):
    """Tag the atoms listed in ``bond_sites`` as edges of ``NNAA``.

    Best effort: if the bond sites are missing or cannot be resolved to
    atoms, a notice is printed and the molecule is left with whatever tags
    were applied before the failure.

    Args:
        NNAA: Molecule of the non-natural amino acid.
        bond_sites: Iterable of atom indices (anything ``int()`` accepts).
    """
    try:
        for site in bond_sites:
            atom = NNAA.GetAtomWithIdx(int(site))
            atom.SetProp("atomNote", "edge")
    except Exception:
        # Deliberately broad, but no longer a bare ``except``: Ctrl-C and
        # interpreter shutdown must still propagate.
        print("No bond sites")
88
+
89
def mark_bond_site(mol, index, symbol):
    """Relabel every tagged atom of element ``symbol`` with ``index``.

    An atom is "tagged" when it carries an ``atomNote`` property; its note
    is overwritten with ``str(index)``.
    """
    label = str(index)
    for atom in mol.GetAtoms():
        if not atom.HasProp("atomNote"):
            continue
        if atom.GetSymbol() != symbol:
            continue
        atom.SetProp("atomNote", label)
93
+
94
def clear_props(atom1, atom2):
    """Remove the ``atomNote`` property from both atoms."""
    for bonded_atom in (atom1, atom2):
        bonded_atom.ClearProp("atomNote")
97
+
98
def get_amino_mol(amino_name, name_smi_dict, NNAA_file):
    """Build the molecule for one monomer with its bonding atoms tagged.

    The backbone N and carbonyl C are tagged via the standard backbone
    pattern. When that fails, the molecule is rebuilt from the monomer's
    ``Bond sites`` entry in ``NNAA_file`` and the atoms listed there are
    tagged instead.

    Args:
        amino_name: Monomer name to look up.
        name_smi_dict: Mapping of monomer name -> SMILES.
        NNAA_file: DataFrame with ``ID`` and ``Bond sites`` columns; each
            ``Bond sites`` value is a Python literal whose first element is
            a SMILES and whose remaining elements are atom indices.

    Returns:
        The tagged molecule.

    Raises:
        KeyError: If ``amino_name`` is not a key of ``name_smi_dict``.
    """
    try:
        aa_smi = name_smi_dict[amino_name]
    except KeyError:
        # Fail with a readable message instead of an UnboundLocalError at return.
        raise KeyError(f"Unknown monomer {amino_name!r}: no SMILES registered for it") from None

    amino_mol = Chem.MolFromSmiles(aa_smi)
    try:
        mark_edge(amino_mol, valid_backbone, edge_N)
        mark_edge(amino_mol, valid_backbone, edge_C)
    except Exception:
        # No standard backbone (or unparsable SMILES): use the explicit bond
        # sites recorded for this monomer. If several rows share the ID the
        # last one wins; if none does, the untagged molecule is returned.
        matching_rows = NNAA_file[NNAA_file["ID"] == amino_name]
        for _, row in matching_rows.iterrows():
            bond_info = ast.literal_eval(row["Bond sites"])
            smiles_rootedAtAtom0 = bond_info[0]
            bond_sites = bond_info[1:]
            amino_mol = Chem.MolFromSmiles(smiles_rootedAtAtom0)
            mark_edge_NNAA(amino_mol, bond_sites)
    return amino_mol
115
+
116
+
117
def process_batch(batch_df):
    """Run ``process_row`` on every row of ``batch_df``.

    Returns:
        List of ``(row_index, smiles)`` tuples in row order.
    """
    # The inner single-element loop unpacks process_row's pair so each
    # result is rebuilt as a plain 2-tuple.
    return [
        (row_index, smiles)
        for idx, row in batch_df.iterrows()
        for row_index, smiles in [process_row(idx, row)]
    ]
123
+
124
def process_row(index, row):
    """Assemble the peptide SMILES for one row from its monomer sequence.

    A row that already holds a non-empty ``SMILES`` string is passed
    through unchanged. Otherwise ``SEQUENCE`` is tokenised, each token is
    turned into a molecule, the last residue is swapped for its terminal
    form, and neighbouring residues are joined C -> N one bond at a time.

    Args:
        index: Row label, returned unchanged so results can be written back.
        row: Mapping with a ``SEQUENCE`` entry and optionally ``SMILES``.

    Returns:
        ``(index, smiles)``; ``smiles`` is ``None`` when assembly raised or
        the product still contains disconnected fragments.
    """
    # isinstance (not type() ==) so float subclasses such as numpy floats
    # holding NaN are also treated as "no SMILES yet" instead of reaching len().
    if "SMILES" in row and not isinstance(row["SMILES"], float) and len(row["SMILES"]) != 0:
        return index, row.get("SMILES")  # Return the existing SMILES if present

    seq = row["SEQUENCE"]
    split_seq = regex.findall(seq)

    try:
        ordered_aminos = [get_amino_mol(token, name_smi_dict, NNAA_file) for token in split_seq]

        # Replace the last amino with the terminal amino
        amino_ter = split_seq[-1]
        if "ter" not in amino_ter and not amino_ter.startswith("Z"):
            amino_ter = f"{amino_ter}ter"
        ordered_aminos[-1] = get_amino_mol(amino_ter, name_smi_dict, NNAA_file)

        combined = ordered_aminos[0]
        for i in range(len(ordered_aminos) - 1):
            # Label the open C of the chain with i and the open N of the
            # next residue with i+1, then bond the atoms carrying those labels.
            mark_bond_site(combined, i, "C")
            next_amino = ordered_aminos[i + 1]
            mark_bond_site(next_amino, i + 1, "N")
            combined = Chem.CombineMols(combined, next_amino)
            rwmol = Chem.RWMol(combined)
            for atom1 in rwmol.GetAtoms():
                if atom1.HasProp("atomNote") and atom1.GetProp("atomNote") == f"{i}":
                    for atom2 in rwmol.GetAtoms():
                        if atom2.HasProp("atomNote") and atom2.GetProp("atomNote") == f"{i+1}":
                            rwmol.AddBond(atom1.GetIdx(), atom2.GetIdx(), Chem.BondType.SINGLE)
                            clear_props(atom1, atom2)
                            # Accept the bond only if the chain now holds exactly i+1 peptide bonds.
                            if len(rwmol.GetSubstructMatches(peptide_bond_mol)) == i + 1:
                                combined = rwmol.GetMol()
                                break

        result = Chem.MolToSmiles(combined, isomericSmiles=True, rootedAtAtom=0, canonical=True)
        if '.' in result:
            return index, None  # Indicates unbound atoms
        return index, result

    except Exception as e:
        # Report the cause as well; the sequence alone gives no hint of what failed.
        print(f"Error in sequence: {seq} ({type(e).__name__}: {e})")
        return index, None
169
+
170
# Register every monomer from the NNAA table in name_smi_dict.
NNAA_file = pd.read_csv(args.NNAA_file, sep="\t")
for _, nnaa_row in NNAA_file.iterrows():
    smiles, name = nnaa_row["SMILES"], nnaa_row["ID"]
    NNAA = Chem.MolFromSmiles(smiles)
    if NNAA.HasSubstructMatch(valid_backbone_OH):
        # In-chain form: same structure with the backbone hydroxyl oxygen removed.
        rwmol = Chem.RWMol(NNAA)
        rwmol.RemoveAtom(NNAA.GetSubstructMatch(valid_backbone_OH)[edge_O])
        name_smi_dict[name] = Chem.MolToSmiles(rwmol)
    # The unmodified SMILES is stored under "<name>ter", except for names
    # starting with "Z", which keep their name as-is.
    terminal_name = name if name.startswith("Z") else f"{name}ter"
    name_smi_dict[terminal_name] = smiles

# Sequence tokens: X<digits>, Z<digits>, or a single letter A-W or Y.
tokenizer = r"X\d+|Z\d+|[A-WY]"
regex = re.compile(tokenizer)

df = pd.read_csv(args.sequence_file, sep="\t")

# Start from an empty SMILES column so every row is (re)built.
df["SMILES"] = ""

# Slice the frame into consecutive batches of at most batch_size rows.
batch_size = args.batch_size
batches = [df[start:start + batch_size] for start in range(0, df.shape[0], batch_size)]

# Hand the batches to a thread pool and write results back as they complete.
with ThreadPoolExecutor() as executor:
    pending = [executor.submit(process_batch, batch) for batch in batches]
    for finished in tqdm(as_completed(pending), total=len(pending)):
        for row_index, built_smiles in finished.result():
            if built_smiles:
                df.at[row_index, "SMILES"] = built_smiles

# Write the table, including rows whose SMILES stayed empty.
os.makedirs(args.output_dir, exist_ok=True)
output_file = os.path.join(args.output_dir, args.demonomerized_file)
df.to_csv(output_file, sep="\t", index=False)
src/draw.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import matplotlib.pyplot as plt
3
+ from matplotlib.colors import ListedColormap
4
+ from collections import defaultdict
5
+ from rdkit.Chem.Draw import rdMolDraw2D
6
+ import numpy as np
7
+
8
class MoleculeDrawer:
    """Render molecules as SVG with atoms coloured by their amino acid label."""

    def __init__(self, output_dir="output/tmp"):
        """Create ``<output_dir>/raw/images`` and set up the residue colour table."""
        self.output_dir = os.path.join(output_dir, "raw/images")
        os.makedirs(self.output_dir, exist_ok=True)
        # Three-letter residue code -> RGB colour (components in 0..1).
        self.aa2color_dict = {
            "Asp": (0.902, 0.039, 0.039),
            "Glu": (0.961, 0.1, 0.537),
            "Arg": (0.078, 0.353, 1),
            "Lys": (0.42, 0.353, 1),
            "His": (0.51, 0.51, 0.824),
            "Tyr": (0.196, 0.196, 0.667),
            "Phe": (0.341, 0.196, 0.667),
            "Trp": (0.706, 0.353, 0.706),
            "Asn": (0, 0.863, 0.863),
            "Gln": (0.5, 0.82, 0.863),
            "Met": (0.902, 0.902, 0),
            "Cys": (0.722, 0.902, 0),
            "Ser": (0.98, 0.588, 0),
            "Thr": (0, 0.612, 0.412),
            "Gly": (0.98, 0.922, 0.922),
            "Ala": (0.784, 0.784, 0.639),
            "Val": (0.059, 0.51, 0.059),
            "Leu": (0.29, 0.51, 0.059),
            "Ile": (0.29, 0.51, 0.471),
            "Pro": (1, 0.588, 0.51),
        }

    def sort_atom_highlights(self, mol):
        """Map atom index -> ``[colour]`` for every atom labelled with a known residue.

        Each atom's ``AA`` property is read; atoms whose label is rejected
        by ``label_belongs_to_AA`` are left out of the result.
        """
        highlights = {}
        for atom_idx in range(mol.GetNumAtoms()):
            aa_label = mol.GetAtomWithIdx(atom_idx).GetProp("AA")
            if not self.label_belongs_to_AA(aa_label):
                continue
            # Only the first three characters of the label select the colour.
            highlights[atom_idx] = [self.aa2color_dict[aa_label[:3]]]
        return highlights

    def create_colormap(self):
        """Save a colour legend for all residues as ``colormap.png`` in the output directory."""
        legend_data = [(aa[:3], color) for aa, color in self.aa2color_dict.items() if aa != "Unk"]
        labels = [label for label, _ in legend_data]
        colors = [color for _, color in legend_data]
        positions = np.arange(len(legend_data))

        fig, ax = plt.subplots(figsize=(1, 1))
        image = ax.matshow(positions.reshape(1, -1), cmap=ListedColormap(colors))
        cbar = fig.colorbar(image, ticks=positions, aspect=5)
        cbar.set_ticklabels(labels)
        cbar.ax.tick_params(labelsize=3)
        ax.axis("off")
        plt.savefig(os.path.join(self.output_dir, "colormap.png"), bbox_inches="tight", dpi=300)
        plt.close()

    def draw_input_mol(self, mol, mol_index, seq, bond_highlights):
        """Draw ``mol`` as ``mol_<mol_index>.svg`` and refresh the colour legend image.

        Args:
            mol: Molecule whose atoms carry an ``AA`` property.
            mol_index: Number used in the file name and legend.
            seq: Sequence text shown in the legend.
            bond_highlights: Mapping of bond index -> iterable of colours, or a falsy value.
        """
        atom_highlights = self.sort_atom_highlights(mol)

        # Normalise the bond highlights into a plain dict of lists.
        if bond_highlights:
            bond_highlights = {bond_idx: list(colors) for bond_idx, colors in bond_highlights.items()}
        else:
            bond_highlights = {}

        mol_name = f"mol_{mol_index}"
        legend_lines = [
            mol_name,
            f"seq: {seq}",
            "8< = peptide bond",
            "AA_NAME:SEEN_COUNT:SEQUENCE_POSITION",
        ]
        legend = "\n".join(legend_lines) + "\n"

        self.draw_mol(mol, atom_highlights, bond_highlights, legend, mol_name)
        self.create_colormap()

    def draw_mol(self, mol, atom_highlights, bond_highlights, legend, mol_name):
        """Render ``mol`` with the given highlights to ``<mol_name>.svg`` (600x300 canvas)."""
        canvas = rdMolDraw2D.MolDraw2DSVG(600, 300)
        canvas.drawOptions().useBWAtomPalette()
        canvas.DrawMoleculeWithHighlights(mol, legend, dict(atom_highlights), dict(bond_highlights), {}, {})
        canvas.FinishDrawing()
        svg_path = os.path.join(self.output_dir, f"{mol_name}.svg")
        with open(svg_path, "w") as svg_file:
            svg_file.write(canvas.GetDrawingText())

    def label_belongs_to_AA(self, label):
        """Return True unless the label begins with ``"Unk"`` or with ``"X"``."""
        return not (label[:3] == "Unk" or label.startswith("X"))
src/monomer_analyzer.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/monomerizer.py ADDED
@@ -0,0 +1,882 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # This script takes an isomeric SMILES file as input and outputs a seq (like fasta) file with the corresponding amino acid sequence.
4
+ # The script also outputs an isomeric SMILES file with the NNAAs (non-natural amino acids) labeled as "X".
5
+ # Any compound connected to a valid backbone is considered an individual amino acid.
6
+ # The NNAAs that do not possess a valid backbone "[NH,NH2]CC(=O)O" required to continuously form peptide bonds, are considered as terminal modifications, and are named as "X0ter", "X1ter", etc.
7
+
8
+ import os
9
+ from rdkit import Chem
10
+ from rdkit.Chem import RegistrationHash
11
+ from rdkit.Chem.RegistrationHash import HashLayer
12
+ from collections import deque
13
+ import argparse
14
+ from tqdm import tqdm
15
+ import pandas as pd
16
+ from concurrent.futures import ThreadPoolExecutor, as_completed
17
+ import multiprocessing as mp
18
+ from draw import MoleculeDrawer
19
+ from collections import defaultdict
20
+
21
+
22
def parse_arguments():
    """Define the command-line options of the script and parse ``sys.argv``.

    Returns:
        argparse.Namespace with ``input_file``, ``process_cyclic``,
        ``min_amino_acids``, ``batch_size``, ``output_dir``, ``max_workers``
        and ``draw``.
    """
    cli = argparse.ArgumentParser(
        description="Process SMILES files and generate amino acid sequences."
    )

    # Input / output locations.
    cli.add_argument("--input_file", default="demo/example_smiles.txt", help="Input SMILES file")
    cli.add_argument("--output_dir", default="output/tmp", help="Output directory")

    # Filtering options.
    cli.add_argument("-process_cyclic", action="store_true", help="Process cyclic peptides")
    cli.add_argument("--min_amino_acids", type=int, default=3, help="Minimum number of amino acids")

    # Execution options.
    cli.add_argument("--batch_size", type=int, default=100, help="Batch size")
    cli.add_argument(
        "--max_workers",
        type=int,
        default=mp.cpu_count(),
        help="Maximum number of workers for parallel processing",
    )
    cli.add_argument("-draw", action="store_true", help="Draw molecules")

    parsed = cli.parse_args()
    return parsed
32
+
33
# Reference SMILES for the 20 natural amino acids, keyed by a label whose first
# three characters are the three-letter residue code (the code looks labels up
# with ``label[:3]``).
# "<Xxx>Ter" entries keep the carboxyl -OH (C(=O)O); plain "<Xxx>" entries end
# in C(=O). Arg has two alternative SMILES for each form.
# NOTE: match_AA() iterates this dict in insertion order and never re-labels an
# atom that was already claimed, so the ordering of the entries is significant.
name_smi_dict = {
    # isomeric SMILES from pubchem. eg https://pubchem.ncbi.nlm.nih.gov/compound/Alanine except for Asp (from https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=3309) and Arg (from https://en.wikipedia.org/wiki/Arginine)
    "TrpTer": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N",
    "Trp": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O))N",
    "ArgTer": "C(C[C@@H](C(=O)O)N)CNC(=N)N",
    "Arg Ter": "NC(N)=NCCC[C@H](N)C(=O)O",
    "Arg": "C(C[C@@H](C(=O))N)CNC(=N)N",
    "Arg2": "NC(N)=NCCC[C@H](N)C(=O)",
    "HisTer": "C1=C(NC=N1)C[C@@H](C(=O)O)N",
    "His": "C1=C(NC=N1)C[C@@H](C(=O))N",
    "ProTer": "C1C[C@H](NC1)C(=O)O",
    "Pro": "C1C[C@H](NC1)C(=O)",
    "LysTer": "C(CCN)C[C@@H](C(=O)O)N",
    "Lys": "C(CCN)C[C@@H](C(=O))N",
    "MetTer": "CSCC[C@@H](C(=O)O)N",
    "Met": "CSCC[C@@H](C(=O))N",
    "GlnTer": "C(CC(=O)N)[C@@H](C(=O)O)N",
    "Gln": "C(CC(=O)N)[C@@H](C(=O))N",
    "AsnTer": "C([C@@H](C(=O)O)N)C(=O)N",
    "Asn": "C([C@@H](C(=O))N)C(=O)N",
    "GluTer": "C(CC(=O)O)[C@@H](C(=O)O)N",
    "Glu": "C(CC(=O)O)[C@@H](C(=O))N",
    "AspTer": "OC(=O)C[C@@H](C(=O)O)N",
    "Asp": "OC(=O)C[C@@H](C(=O))N",
    "TyrTer": "C1=CC(=CC=C1C[C@@H](C(=O)O)N)O",
    "Tyr": "C1=CC(=CC=C1C[C@@H](C(=O))N)O",
    "PheTer": "C1=CC=C(C=C1)C[C@@H](C(=O)O)N",
    "Phe": "C1=CC=C(C=C1)C[C@@H](C(=O))N",
    "IleTer": "CC[C@H](C)[C@@H](C(=O)O)N",  # TODO add correct hydroxyl oxygen for every AA terminal
    "Ile": "CC[C@H](C)[C@@H](C(=O))N",
    "LeuTer": "CC(C)C[C@@H](C(=O)O)N",
    "Leu": "CC(C)C[C@@H](C(=O))N",
    "ValTer": "CC(C)[C@@H](C(=O)O)N",
    "Val": "CC(C)[C@@H](C(=O))N",
    "ThrTer": "C[C@H]([C@@H](C(=O)O)N)O",
    "Thr": "C[C@H]([C@@H](C(=O))N)O",
    "CysTer": "C([C@@H](C(=O)O)N)S",
    "Cys": "C([C@@H](C(=O))N)S",
    "SerTer": "C([C@@H](C(=O)O)N)O",
    "Ser": "C([C@@H](C(=O))N)O",
    "AlaTer": "C[C@@H](C(=O)O)N",
    # FBR: I wonder if we should have a SMILES for AlaStart
    "Ala": "C[C@@H](C(=O))N",
    # Saturated the carbon
    "GlyTer": "C(C(=O)O)N",
    "Gly": "C(C(=O))N",
}
80
+
81
# Parse every reference SMILES once at import time; keys and ordering follow
# name_smi_dict, values are the molecules returned by Chem.MolFromSmiles.
smi2mol = {
    aa_name: Chem.MolFromSmiles(aa_smi)
    for aa_name, aa_smi in name_smi_dict.items()
}
84
+
85
# SMARTS query used by find_peptide_bonds(). Atom order in a match tuple is
# N(0) - C(1) - C(2)(=O(3)) - X(4), where X is any atom that is not an
# aliphatic oxygen.
peptide_bond_mol = Chem.MolFromSmarts("[N,n][C,c]C(=O)[*!O]")  # [*!O] ensures it does not match AAter
# Positions, inside a peptide_bond_mol match, of the two atoms joined by the
# bond that gets tagged as a peptide bond.
edge_C_position = 2
edge_N_position = 4  # NOTE(review): the SMARTS only requires "not O" here; presumably N in practice
# Backbone that detect_terminal() requires for an NNAA to count as non-terminal.
valid_backbone = Chem.MolFromSmarts("[NH,NH2]CC(=O)[OH]")
loose_backbone = Chem.MolFromSmarts("[C,c](C(=O)O)[N,n]")  # Also detects backbone that contains a benzene ring. Used for removing -OH
# Position, inside a loose_backbone match, of the single-bonded carboxyl oxygen.
OH_position = 3
# Template atom (atomic number 8) that add_OH() appends when capping a cut bond.
oxygen = Chem.Atom(8)
92
+
93
# Three-letter residue code -> one-letter code. write_seq() indexes this with
# the first three characters of an atom's "AA" label.
three2one_letter = {
    "Ala": "A",
    "Gly": "G",
    "Ile": "I",
    "Leu": "L",
    "Pro": "P",
    "Val": "V",
    "Phe": "F",
    "Trp": "W",
    "Tyr": "Y",
    "Asp": "D",
    "Glu": "E",
    "Arg": "R",
    "His": "H",
    "Lys": "K",
    "Ser": "S",
    "Thr": "T",
    "Cys": "C",
    "Met": "M",
    "Asn": "N",
    "Gln": "Q",
}
115
+
116
# Three-letter residue code -> colour triple (three values between 0 and 1).
# highlight_bonds_with_AA() uses these to colour bonds that lie inside one
# natural amino acid.
aa2color_dict = {
    "Asp": (0.902, 0.039, 0.039),
    "Glu": (0.961, 0.1, 0.537),
    "Arg": (0.078, 0.353, 1),
    "Lys": (0.42, 0.353, 1),
    "His": (0.51, 0.51, 0.824),
    "Tyr": (0.196, 0.196, 0.667),
    "Phe": (0.341, 0.196, 0.667),
    "Trp": (0.706, 0.353, 0.706),
    "Asn": (0, 0.863, 0.863),
    "Gln": (0.5, 0.82, 0.863),
    "Met": (0.902, 0.902, 0),
    "Cys": (0.722, 0.902, 0),
    "Ser": (0.98, 0.588, 0),
    "Thr": (0, 0.612, 0.412),
    "Gly": (0.98, 0.922, 0.922),
    "Ala": (0.784, 0.784, 0.639),
    "Val": (0.059, 0.51, 0.059),
    "Leu": (0.29, 0.51, 0.059),
    "Ile": (0.29, 0.51, 0.471),
    "Pro": (1, 0.588, 0.51),
}
138
+
139
def tuple_fully_unmatched(indexes_group, already_matched, mol_a):
    """Tell whether every atom of a candidate match is still free.

    An atom is taken when its "AA" property starts with "Unk" or when its
    index is already in ``already_matched``.

    Args:
        indexes_group: atom indices of one substructure match.
        already_matched: collection of atom indices claimed so far.
        mol_a: molecule the indices refer to.

    Returns:
        True if no atom of the group is taken, otherwise False.
    """
    for atom_idx in indexes_group:
        atom = mol_a.GetAtomWithIdx(atom_idx)
        if atom.HasProp("AA") and atom.GetProp("AA").startswith("Unk"):
            return False
        if atom_idx in already_matched:
            return False
    return True
152
+
153
+
154
def match_AA(mol_b, dict):
    """Tag the atoms of every recognised amino acid with an "AA" property.

    The reference molecules are tried in the iteration order of ``dict``; a
    match is accepted only if none of its atoms was claimed earlier. Accepted
    atoms receive ``"<name>:<i>"``, where ``i`` is the position of the match
    among all matches of that reference (rejected matches still consume a
    number).

    Args:
        mol_b: molecule to annotate in place.
        dict: mapping of amino-acid name -> reference molecule.
    """
    claimed = set()
    for aa_name, aa_mol in dict.items():
        hits = mol_b.GetSubstructMatches(aa_mol, useChirality=True)
        for occurrence, atom_group in enumerate(hits):
            if not tuple_fully_unmatched(atom_group, claimed, mol_b):
                continue
            tag = aa_name + ":" + str(occurrence)
            for atom_idx in atom_group:
                mol_b.GetAtomWithIdx(atom_idx).SetProp("AA", tag)
                claimed.add(atom_idx)
165
+
166
+
167
def find_peptide_bonds(mol_c):
    """Locate the atom pairs on either side of each peptide bond.

    Args:
        mol_c: molecule to search with the module-level ``peptide_bond_mol``.

    Returns:
        A list of ``[C_idx, N_idx]`` pairs, one per substructure match.
    """
    return [
        [
            mol_c.GetAtomWithIdx(hit[edge_C_position]).GetIdx(),
            mol_c.GetAtomWithIdx(hit[edge_N_position]).GetIdx(),
        ]
        for hit in mol_c.GetSubstructMatches(peptide_bond_mol)
    ]
174
+
175
+
176
def set_peptide_bond_prop(mol, atom_indices_surrounding_peptide_bond):
    """Mark peptide bonds and their end atoms, and collect the bond indices.

    Each end atom gets a "bond_site" property ("C" or "N"); the bond itself
    gets "bondNote" and "peptide_bond" properties.

    Args:
        mol: molecule to annotate in place.
        atom_indices_surrounding_peptide_bond: iterable of (C_idx, N_idx).

    Returns:
        List of bond indices, in input order.
    """
    bond_indices = []
    for carbon_idx, nitrogen_idx in atom_indices_surrounding_peptide_bond:
        for atom_idx, site in ((carbon_idx, "C"), (nitrogen_idx, "N")):
            mol.GetAtomWithIdx(atom_idx).SetProp("bond_site", site)
        bond = mol.GetBondBetweenAtoms(carbon_idx, nitrogen_idx)
        bond.SetProp("bondNote", "8<")
        bond.SetProp("peptide_bond", "peptide_bond")
        bond_indices.append(bond.GetIdx())
    return bond_indices
186
+
187
+
188
def label_peptide_bonds(mol_e):
    """Find all peptide bonds of ``mol_e``, tag them, and return their bond indices."""
    return set_peptide_bond_prop(mol_e, find_peptide_bonds(mol_e))
192
+
193
+
194
def label_NNAAs(mol_e, peptide_bonds):
    """Label every atom that no natural amino acid claimed as part of an NNAA.

    Atoms are scanned in index order; each atom still lacking an "AA" property
    seeds a new "Unk<k>" group via ``label_unmatched_NNAA``.

    Args:
        mol_e: molecule already processed by ``match_AA``; annotated in place.
        peptide_bonds: bond indices tagged as peptide bonds.

    Returns:
        Number of NNAA groups that were created.
    """
    NNAA_idx = 0
    # Atom indices run over *all* atoms of the graph. The heavy-atom count is
    # smaller than that whenever hydrogens are kept as real atoms (e.g.
    # isotope-labelled H), and using it as the bound left the highest-indexed
    # atoms without any label.
    for a_i in range(mol_e.GetNumAtoms()):
        if mol_e.GetAtomWithIdx(a_i).HasProp("AA"):
            continue
        label_unmatched_NNAA(mol_e, a_i, NNAA_idx, peptide_bonds)
        NNAA_idx += 1
    return NNAA_idx
205
+
206
def prepare_graph(first_atom_index):
    """Create the BFS work queue and visited set, both seeded with one atom index.

    Returns:
        ``(queue, visited)`` as a ``deque`` and a ``set``.
    """
    seed = [first_atom_index]
    return deque(seed), set(seed)
210
+
211
+
212
def enqueue_neighbor_indices(mol_f, atom, queue, visited):
    """Push every not-yet-visited neighbour of ``atom`` onto the BFS queue.

    Both ``queue`` and ``visited`` are updated in place and also returned.
    """
    for _, neighbor_idx in get_neighbors(mol_f, atom):
        if neighbor_idx in visited:
            continue
        queue.append(neighbor_idx)
        visited.add(neighbor_idx)
    return queue, visited
219
+
220
+
221
def get_neighbors(mol_g, atom):
    """Return ``[neighbor_atom, neighbor_index]`` pairs for each neighbour of ``atom``.

    The atom object of each pair is fetched from ``mol_g`` by index.
    """
    return [
        [mol_g.GetAtomWithIdx(idx), idx]
        for idx in (nbr.GetIdx() for nbr in atom.GetNeighbors())
    ]
228
+
229
+
230
def cross_peptide_bond(mol_f, current_atom_idx, neighbor_idx, peptide_bonds):
    """Return True if the bond joining the two atoms is one of ``peptide_bonds``."""
    bond = mol_f.GetBondBetweenAtoms(current_atom_idx, neighbor_idx)
    return bond.GetIdx() in peptide_bonds
233
+
234
+
235
def NNAA_continues(neighbor_atom, first_AA_observed):
    """Decide whether the current NNAA group may grow onto ``neighbor_atom``.

    That is the case when the neighbour has no "AA" label yet, or when its
    label equals ``first_AA_observed`` (the amino acid already absorbed by
    the group).
    """
    if not neighbor_atom.HasProp("AA"):
        return True
    return neighbor_atom.GetProp("AA") == first_AA_observed
240
+
241
+
242
def get_current_atom_with_prop(mol_h, atom_idx_queue, prop):
    """Pop the next atom index from the queue and stamp that atom's "AA" with ``prop``.

    Returns:
        ``(atom, atom_index)`` for the popped entry.
    """
    idx = atom_idx_queue.popleft()
    atom = mol_h.GetAtomWithIdx(idx)
    atom.SetProp("AA", prop)
    return atom, idx
247
+
248
+
249
def label_unmatched_NNAA(mol, atom_index_of_the_NNAA, NNAA_idx, peptide_bonds):
    """Flood-fill one NNAA group starting from an unlabelled atom.

    Every atom reached is labelled ``"Unk<NNAA_idx>"``. The walk never crosses
    a peptide bond. It spreads over unlabelled atoms and, additionally, over
    the atoms of the first already-labelled group it touches; any other
    labelled group stops the walk at that point.

    Args:
        mol: molecule annotated in place.
        atom_index_of_the_NNAA: index of the seed atom.
        NNAA_idx: running number used in the "Unk" label.
        peptide_bonds: bond indices that must not be crossed.
    """
    pending, seen = prepare_graph(atom_index_of_the_NNAA)
    absorbed_AA = None
    tag = f"Unk{NNAA_idx}"
    while pending:
        atom, atom_idx = get_current_atom_with_prop(mol, pending, tag)
        for nbr_atom, nbr_idx in get_neighbors(mol, atom):
            if nbr_idx in seen:
                continue
            if cross_peptide_bond(mol, atom_idx, nbr_idx, peptide_bonds):
                continue
            seen.add(nbr_idx)
            if NNAA_continues(nbr_atom, absorbed_AA):
                pending.append(nbr_idx)
            elif absorbed_AA is None:
                # First labelled group encountered: absorb it into this NNAA.
                absorbed_AA = nbr_atom.GetProp("AA")
                pending.append(nbr_idx)
269
+
270
+
271
def get_first_base_aa(mol_j, first_atom_index):
    """Return the "AA" label of the atom at ``first_atom_index``."""
    return mol_j.GetAtomWithIdx(first_atom_index).GetProp("AA")
275
+
276
def label_boundary_bonds(mol):
    """Flag every bond whose two atoms carry different "AA" labels.

    Such a bond gets a "boundary" property, and both of its atoms get
    ``bond_site = "bond_site"``.
    """
    for bond in mol.GetBonds():
        idx_a, idx_b, label_a, label_b = get_connected_atoms_and_props(bond, mol)
        if label_a == label_b:
            continue
        bond.SetProp("boundary", "boundary")
        for atom_idx in (idx_a, idx_b):
            mol.GetAtomWithIdx(atom_idx).SetProp("bond_site", "bond_site")
285
+
286
def add_order_to_atomNote(mol_v, aa_order, current_base_aa):
    """Write ``"<label>:<order>"`` into "atomNote" for every atom labelled ``current_base_aa``."""
    note = f"{current_base_aa}:{aa_order}"
    for atom in mol_v.GetAtoms():
        if atom.GetProp("AA") == current_base_aa:
            atom.SetProp("atomNote", note)
291
+
292
def reorder_AAs(mol_k, first_atom_index):
    """Walk the molecule breadth-first and list the "AA" labels in visiting order.

    The walk starts at ``first_atom_index``. Whenever it reaches an atom whose
    label differs from the current one, the current label is appended to the
    result, the queue is reset to that atom, and the order counter advances.
    Atoms of the current label are annotated with their order on every step.

    Returns:
        List of "AA" labels, one per residue encountered.
    """
    pending, seen = prepare_graph(first_atom_index)
    ordered_labels = []
    position = 1
    base_label = get_first_base_aa(mol_k, first_atom_index)

    while pending:
        add_order_to_atomNote(mol_k, position, base_label)
        idx = pending.popleft()
        atom = mol_k.GetAtomWithIdx(idx)
        label = atom.GetProp("AA")
        if label != base_label:
            base_label, pending = switch_base_and_empty_queue(
                ordered_labels, base_label, label, pending, idx
            )
            position += 1
        enqueue_neighbor_indices(mol_k, atom, pending, seen)

    # The residue being walked when the queue ran dry has not been recorded yet.
    ordered_labels.append(base_label)
    return ordered_labels
312
+
313
+
314
def switch_base_and_empty_queue(
    aa_list, current_base_aa, aa_in_question, atom_idx_queue, idx
):
    """Record the finished residue and restart the BFS queue at atom ``idx``.

    ``current_base_aa`` is appended to ``aa_list`` (in place). The incoming
    queue is discarded, not mutated.

    Returns:
        ``(new_base_label, new_queue)`` where the queue holds only ``idx``.
    """
    aa_list.append(current_base_aa)
    return aa_in_question, deque([idx])
321
+
322
def label_belongs_to_AA(label):
    """Return True for a natural amino-acid label, i.e. one that neither starts with "Unk" nor with "X"."""
    is_unknown = label[:3] == "Unk"
    is_named_nnaa = label.startswith("X")
    return not (is_unknown or is_named_nnaa)
325
+
326
+
327
def record_if_terminal(peptide_bonded_props, peptide_bonded_atoms, prop, atom):
    """Toggle ``prop`` in the list of residues seen at exactly one peptide bond.

    First sighting: ``prop`` and ``atom`` are appended to their lists.
    Second sighting: ``prop`` is removed again (the residue has a peptide bond
    on both sides); the atom list is left untouched.
    """
    if prop not in peptide_bonded_props:
        peptide_bonded_props.append(prop)
        peptide_bonded_atoms.append(atom)
        return
    peptide_bonded_props.remove(prop)
335
+
336
+
337
def get_first_atom_index(mol_l, peptide_bonded_props, peptide_bonded_atoms):
    """Pick the atom from which the residue walk should start.

    Returns the lowest atom index that is listed in ``peptide_bonded_atoms``,
    whose "AA" label is in ``peptide_bonded_props``, and whose symbol is "C".
    Falls back to 0 when no atom qualifies.
    """
    candidates = (
        idx
        for idx in range(mol_l.GetNumAtoms())
        if idx in peptide_bonded_atoms
        and mol_l.GetAtomWithIdx(idx).GetProp("AA") in peptide_bonded_props
        and mol_l.GetAtomWithIdx(idx).GetSymbol() == "C"
    )
    return next(candidates, 0)
349
+
350
+
351
def mol_is_cyclic_peptide(mol_u, ignore_cyclic_peptide):
    """Report whether the molecule should be rejected as a cyclic peptide.

    A molecule counts as cyclic when any ring bond is tagged "boundary" or
    "peptide_bond".

    Args:
        mol_u: molecule whose bonds were already tagged.
        ignore_cyclic_peptide: when falsy, the check is disabled.

    Returns:
        Always a bool. (Previously the "not cyclic" path fell off the end of
        the function and returned None.)
    """
    if not ignore_cyclic_peptide:
        return False
    return any(
        bond.IsInRing() and (bond.HasProp("boundary") or bond.HasProp("peptide_bond"))
        for bond in mol_u.GetBonds()  # any bond, peptide bonds included
    )
357
+
358
+
359
def search_terminal_AA(mol_m):  # for highlight and searching terminal AA
    """Collect the residues that take part in only one peptide bond.

    Every bond is inspected; for each peptide bond both end labels are toggled
    through ``record_if_terminal``.

    Returns:
        ``(peptide_bonded_props, peptide_bonded_atoms)`` as built by
        ``record_if_terminal``.
    """
    dangling_props, dangling_atoms = [], []
    for bond in mol_m.GetBonds():
        idx_a, idx_b, label_a, label_b = get_connected_atoms_and_props(bond, mol_m)
        if not bond.HasProp("peptide_bond"):
            continue
        # A label stays in the list only if it was seen an odd number of times.
        for label, atom_idx in ((label_a, idx_a), (label_b, idx_b)):
            record_if_terminal(dangling_props, dangling_atoms, label, atom_idx)
    return dangling_props, dangling_atoms
373
+
374
+
375
def get_connected_atoms_and_props(bond, mol_t):
    """Return ``(begin_idx, end_idx, begin_label, end_label)`` for a bond.

    The labels are the "AA" properties of the two atoms; an atom without that
    property makes the underlying ``GetProp`` call raise.
    """
    begin_idx = bond.GetBeginAtomIdx()
    end_idx = bond.GetEndAtomIdx()
    begin_label = mol_t.GetAtomWithIdx(begin_idx).GetProp("AA")
    end_label = mol_t.GetAtomWithIdx(end_idx).GetProp("AA")
    return begin_idx, end_idx, begin_label, end_label
381
+
382
def write_seq(aa_list):
    """Translate residue labels into sequence tokens.

    "Unk..." labels become "?", "X..." labels keep the part before the first
    ":", and everything else is mapped to its one-letter code through
    ``three2one_letter``.

    Returns:
        List of tokens, one per input label.
    """
    def to_token(aa):
        if aa[:3] == "Unk":
            return "?"
        if aa.startswith("X"):
            return aa.split(":")[0]
        return three2one_letter[aa[:3]]

    return [to_token(aa) for aa in aa_list]
393
+
394
def get_NNAAs(mol):
    """Cut the molecule at its peptide bonds and return the non-natural fragments.

    Works on an editable copy: peptide bonds are removed first, then every
    atom labelled as a natural amino acid is deleted.

    Returns:
        The remaining fragments as molecules, or the string "error" if
        fragment extraction raises ValueError.
    """
    editable = Chem.RWMol(mol)
    remove_peptide_bonds(editable)  # this needs to come before remove_atoms
    remove_atoms(editable, mol, label_belongs_to_AA)
    try:
        fragments = Chem.GetMolFrags(editable, asMols=True, sanitizeFrags=True)
    except ValueError:
        return "error"
    return fragments
402
+
403
+
404
def remove_atoms(rwmol, mol, func, **kwargs):
    """Delete from ``rwmol`` every atom whose "AA" label satisfies ``func``.

    Indices are visited from ``mol.GetNumAtoms() - 1`` down to 0, so atoms
    that ``rwmol`` holds beyond the atom count of ``mol`` are never inspected.

    Args:
        rwmol: editable molecule, modified in place.
        mol: molecule whose atom count bounds the scan.
        func: predicate called as ``func(label, **kwargs)``.
    """
    for atom_idx in range(mol.GetNumAtoms() - 1, -1, -1):
        label = rwmol.GetAtomWithIdx(atom_idx).GetProp("AA")
        if func(label, **kwargs):
            rwmol.RemoveAtom(atom_idx)
411
+
412
def add_OH(rwmol, begin_atom_idx, end_atom_idx):
    """Append an oxygen atom and single-bond it to the carbon end of a cut bond.

    The begin atom is tried first, then the end atom; if neither is a carbon
    the new oxygen is left unbonded.
    """
    rwmol.AddAtom(oxygen)
    new_oxygen_idx = rwmol.GetNumAtoms() - 1
    carbon = 6
    if rwmol.GetAtomWithIdx(begin_atom_idx).GetAtomicNum() == carbon:
        rwmol.AddBond(begin_atom_idx, new_oxygen_idx, Chem.BondType.SINGLE)
        return
    if rwmol.GetAtomWithIdx(end_atom_idx).GetAtomicNum() == carbon:
        rwmol.AddBond(new_oxygen_idx, end_atom_idx, Chem.BondType.SINGLE)
419
+
420
+
421
def remove_peptide_bonds(rwmol):
    """Break every non-ring peptide bond and cap the carbon side with an oxygen.

    Bond indices are walked from the highest (as counted on entry) down to 0.
    """
    for bond_idx in range(rwmol.GetNumBonds() - 1, -1, -1):
        bond = rwmol.GetBondWithIdx(bond_idx)
        if not bond.HasProp("peptide_bond") or bond.IsInRing():
            continue
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        rwmol.RemoveBond(begin_idx, end_idx)
        add_OH(rwmol, begin_idx, end_idx)
432
+
433
def detect_terminal(NNAA):
    """Return "NotTer" if the fragment contains ``valid_backbone``, otherwise "ter"."""
    # "ter" is deliberately lower-case: don't use a capital letter, for tokenization.
    return "NotTer" if NNAA.HasSubstructMatch(valid_backbone) else "ter"
438
+
439
+
440
def enlist_NNAA(new_NNAA, df, ter_or_not, bond_atom_indices):
    """Append one NNAA to the registry table, keeping one row per SMILES.

    Args:
        new_NNAA: fragment molecule.
        df: current registry DataFrame.
        ter_or_not: value stored in the TERMINAL column.
        bond_atom_indices: list of bond-site atom indices of the fragment.

    Returns:
        A new DataFrame with the row added and duplicates (by SMILES) dropped.
        The BOND SITES cell is the SMILES rooted at atom 0 followed by the
        given indices.
    """
    canonical_smi = Chem.MolToSmiles(new_NNAA, isomericSmiles=True, canonical=True)
    rooted_smi = Chem.MolToSmiles(
        new_NNAA, isomericSmiles=True, canonical=True, rootedAtAtom=0
    )
    row = pd.DataFrame({
        'SMILES': [canonical_smi],
        'TERMINAL': [ter_or_not],
        'BOND SITES': [[rooted_smi] + bond_atom_indices],
        'MOL': [new_NNAA],
    })
    combined = pd.concat([df, row], ignore_index=True)
    # deduplicate by SMILES
    return combined.drop_duplicates(subset=['SMILES'])
457
+
458
def add_IDs(df):
    """Assign an "X<n>" ID to each tautomer-hash group, suffixed with "ter" for terminal rows.

    Groups are numbered in order of first appearance of their TAUTOMER HASH.
    ``df`` is modified in place and returned.
    """
    hashes = df['TAUTOMER HASH']
    distinct_hashes = hashes.drop_duplicates().reset_index(drop=True)

    for number, tautomer_hash in enumerate(distinct_hashes):
        df.loc[hashes == tautomer_hash, 'ID'] = f"X{number}"

    # Terminal modifications carry a 'ter' suffix in their ID.
    is_terminal = df['TERMINAL'] == 'ter'
    df.loc[is_terminal, 'ID'] = df['ID'] + 'ter'

    return df
469
+
470
def relabel_NNAA(mol, NNAA_df):
    """Replace provisional "Unk<k>" labels with the registry IDs of matching NNAAs.

    For each distinct "Unk" label, the atoms carrying it are isolated and
    compared against every registry molecule; the first perfect match renames
    the group to ``"<ID>:<n>"``, where ``n`` counts earlier groups of this
    molecule that received the same ID. Groups without a match keep their
    "Unk" label.

    Args:
        mol: labelled molecule, modified in place.
        NNAA_df: registry with 'MOL' and 'ID' columns.

    Returns:
        The same molecule.
    """
    handled_unk_labels = set()
    times_assigned = {}  # registry ID -> number of groups already given that ID
    for atom_idx in range(mol.GetNumAtoms()):
        # Best-effort per atom: a failure on one group must not abort the rest.
        # Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit
        # propagate (the original bare `except:` caught those too).
        try:
            label = mol.GetAtomWithIdx(atom_idx).GetProp("AA")
            if not label.startswith("Unk") or label in handled_unk_labels:
                continue
            handled_unk_labels.add(label)
            rwmol_from_peptide = Chem.RWMol(mol)
            remove_atoms(rwmol_from_peptide, mol, different_NNAA, Unk_label=label)
            for _, NNAA_row in NNAA_df.iterrows():
                if perfect_match(rwmol_from_peptide, NNAA_row['MOL']):
                    nnaa_name = NNAA_row['ID']
                    seen_times = times_assigned.get(nnaa_name, 0)
                    mol = relabel_prop(mol, label, f"{nnaa_name}:{seen_times}")
                    times_assigned[nnaa_name] = seen_times + 1
                    break
        except Exception:
            continue
    return mol
490
+
491
def different_NNAA(label, Unk_label):
    """Return True when ``label`` is not the group identified by ``Unk_label``."""
    return not (label == Unk_label)
493
+
494
+
495
def relabel_prop(mol, label, nnaa_name):
    """Rename the "AA" property from ``label`` to ``nnaa_name`` on every atom that carries it.

    The molecule is modified in place and returned.
    """
    for atom_idx in range(mol.GetNumAtoms()):
        # Per-atom best effort, as before, but only ordinary errors are
        # swallowed: the original bare `except:` also hid KeyboardInterrupt
        # and SystemExit.
        try:
            atom = mol.GetAtomWithIdx(atom_idx)
            if atom.HasProp("AA") and atom.GetProp("AA") == label:
                atom.SetProp("AA", nnaa_name)
        except Exception:
            continue
    return mol
504
+
505
+
506
def perfect_match(rwmol_NNAA, nnaa_mol):
    """Return True when ``nnaa_mol`` is a chirality-aware substructure of ``rwmol_NNAA`` and both have the same atom count."""
    contains = rwmol_NNAA.HasSubstructMatch(nnaa_mol, useChirality=True)
    if not contains:
        return contains
    return nnaa_mol.GetNumAtoms() == rwmol_NNAA.GetNumAtoms()
511
+
512
def NNAAs_with_OH_removed(NNAA_df):
    """Extend the registry with copies of each NNAA whose backbone -OH is stripped.

    For every row, the oxygen at ``OH_position`` of each ``loose_backbone``
    match is flagged and then deleted. One extra row is emitted per deleted
    atom (so an NNAA with two flagged oxygens yields a row with one removed
    and a row with both removed). Apart from 'MOL', the extra rows are copies
    of the source row.

    Args:
        NNAA_df: registry DataFrame with a 'MOL' column.

    Returns:
        A new DataFrame: the original rows followed by the stripped variants.
    """
    new_rows = []  # List to store the new rows

    for _, row in NNAA_df.iterrows():
        rwmol_NNAA = Chem.RWMol(row['MOL'])

        for backbone_match in rwmol_NNAA.GetSubstructMatches(loose_backbone):
            OH_atom_i = backbone_match[OH_position]
            rwmol_NNAA.GetAtomWithIdx(OH_atom_i).SetProp("ToBeRemoved", "ToBeRemoved")

        # Visit each index exactly once, highest first, so a deletion never
        # shifts an index that is still to be visited. The earlier hand-rolled
        # countdown decremented twice and could step over a flagged atom.
        for atom_idx in range(rwmol_NNAA.GetNumAtoms() - 1, -1, -1):
            if not rwmol_NNAA.GetAtomWithIdx(atom_idx).HasProp("ToBeRemoved"):
                continue
            rwmol_NNAA.RemoveAtom(atom_idx)

            # Same data as the source row except for the modified 'MOL'.
            new_row = row.copy()
            new_row['MOL'] = rwmol_NNAA.GetMol()
            new_rows.append(new_row)

    new_rows_df = pd.DataFrame(new_rows)
    return pd.concat([NNAA_df, new_rows_df], ignore_index=True)
543
+
544
def remove_small_substructs(mol):
    """Reduce a multi-fragment molecule to its largest fragment.

    Args:
        mol: input molecule.

    Returns:
        ``(mol, False)`` when the molecule has at most one fragment; otherwise
        ``(largest_fragment, message)`` where the message is a non-empty
        string describing what happened. Ties go to the first fragment.
    """
    fragments = Chem.GetMolFrags(mol, asMols=True)
    if len(fragments) <= 1:
        return mol, False

    error = "Multiple substructures. Removing the smaller ones."
    # Return the largest fragment directly. Deleting the small fragments as
    # substructure *patterns* restarted from the full molecule on every
    # iteration (so only the last deletion survived) and could also strip
    # matching atoms out of the main fragment.
    largest = max(fragments, key=lambda frag: frag.GetNumAtoms())
    return largest, error
556
+
557
def has_unlabelled_atom(mol, seq_list):
    """Return True if the sequence contains "?" or any atom of ``mol`` lacks an "AA" property."""
    if "?" in seq_list:
        return True
    return any(not atom.HasProp("AA") for atom in mol.GetAtoms())
564
+
565
def linear(peptide_bonds, aminos):
    """Return True when there is exactly one peptide bond fewer than residues."""
    return len(aminos) - 1 == len(peptide_bonds)
567
+
568
def ter_in_the_middle(seq_list):
    """Return True if a token ending in "ter" sits anywhere but the first or last position.

    Always returns a bool (the "not found" path used to return None).
    """
    return any(amino.endswith("ter") for amino in seq_list[1:-1])
572
+
573
def filter_out(seq_list, mol, peptide_bonds):
    """Run the rejection checks in order and return the first failure reason.

    Returns:
        "Not linear", "Has unlabelled atom" or
        "Terminal amino acid in the middle"; False when every check passes.
    """
    checks = (
        (lambda: not linear(peptide_bonds, seq_list), "Not linear"),
        (lambda: has_unlabelled_atom(mol, seq_list), "Has unlabelled atom"),
        (lambda: ter_in_the_middle(seq_list), "Terminal amino acid in the middle"),
    )
    # Checks are lazy so that later ones are skipped once one has failed.
    for failed, reason in checks:
        if failed():
            return reason
    return False
581
+
582
def record_bond_sites(NNAA):
    """Return the indices of all atoms of the fragment that carry a "bond_site" property."""
    return [atom.GetIdx() for atom in NNAA.GetAtoms() if atom.HasProp("bond_site")]
588
+
589
def count_aminos(split_seq, NNAA_counts):
    """Add the occurrences of every "X..." token in ``split_seq`` to ``NNAA_counts``.

    The dict is updated in place and returned; other tokens are ignored.
    """
    for amino in split_seq:
        if not amino.startswith("X"):
            continue
        NNAA_counts[amino] = NNAA_counts.get(amino, 0) + 1
    return NNAA_counts
598
+
599
def load_data(input_file):
    """Read the tab-separated input table and normalise it to one SMILES column.

    A sequential 'ID' column (starting at 1) is added when missing. The
    'SMILES' column is taken from 'ISOSMILES' where available, otherwise from
    'SMILES', with surrounding whitespace stripped; 'ISOSMILES' is then
    dropped. Rows left without a SMILES string are removed.

    Args:
        input_file: path to the TSV file.

    Returns:
        The filtered DataFrame (original index labels are kept).
    """
    print("0/4 Loading input data...")
    table = pd.read_csv(input_file, sep='\t', on_bad_lines='warn')

    if 'ID' not in table.columns:
        table['ID'] = range(1, len(table) + 1)  # unique sequential numbers

    # Make sure both candidate SMILES columns exist so they can be merged.
    for optional_column in ('ISOSMILES', 'SMILES'):
        if optional_column not in table.columns:
            table[optional_column] = None

    # Prefer ISOSMILES, fall back to SMILES.
    merged = table['ISOSMILES'].fillna(table['SMILES'])
    table['SMILES'] = merged.str.strip()

    table = table[table['SMILES'].ne("")]
    table = table.drop(columns=['ISOSMILES'])

    # Keep only rows that actually have a SMILES string.
    has_smiles = (table['SMILES'] != '') & (table['SMILES'].notna())
    return pd.DataFrame(table[has_smiles])
631
+
632
def process_molecule_batch(batch_df, smi2mol, ignore_cyclic_peptide, min_amino_acids, progress_bar):
    """Parse and label every molecule of one batch.

    Args:
        batch_df: rows to process; needs a 'SMILES' column.
        smi2mol: mapping of amino-acid name -> reference molecule.
        ignore_cyclic_peptide: when truthy, cyclic peptides are rejected.
        min_amino_acids: molecules with fewer than ``min_amino_acids - 1``
            peptide bonds are rejected.
        progress_bar: object with ``update``; advanced by one per batch.

    Returns:
        One tuple per row:
        ``(index, mol, all_AA, error, peptide_bonds, NNAAs_info)``.
        For rejected rows everything except ``index`` and ``error`` is None.
    """
    local_mol_data = []

    def reject(index, reason):
        local_mol_data.append((index, None, None, reason, None, None))

    for mol_index, row in batch_df.iterrows():
        try:
            smi = row['SMILES']
            if not smi:
                reject(mol_index, "No SMILES provided")
                continue

            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                reject(mol_index, "Invalid SMILES")
                continue

            mol, error = remove_small_substructs(mol)
            if error:
                reject(mol_index, error)
                continue

            match_AA(mol, smi2mol)
            peptide_bonds = label_peptide_bonds(mol)

            if len(peptide_bonds) < min_amino_acids - 1:
                reject(mol_index, "Not enough amino acids")
                continue

            num_NNAAs = label_NNAAs(mol, peptide_bonds)
            all_AA = num_NNAAs == 0

            label_boundary_bonds(mol)

            if mol_is_cyclic_peptide(mol, ignore_cyclic_peptide):
                reject(mol_index, "Cyclic peptide")
                continue

            NNAAs_info = []
            if not all_AA:
                NNAAs = get_NNAAs(mol)
                if NNAAs == "error":
                    reject(mol_index, "Disconnected molecule")
                    continue
                for NNAA in NNAAs:
                    ter_or_not = detect_terminal(NNAA)
                    bond_sites = record_bond_sites(NNAA)
                    NNAAs_info.append((NNAA, ter_or_not, bond_sites))

            local_mol_data.append((mol_index, mol, all_AA, None, peptide_bonds, NNAAs_info))

        except Exception:
            # One bad molecule must not sink the batch, but KeyboardInterrupt
            # and SystemExit are allowed through (a bare `except:` hid them).
            reject(mol_index, "Unknown error")

    progress_bar.update(1)
    return local_mol_data
687
+
688
def label_molecules_in_batches(mol_df, batch_size, smi2mol, ignore_cyclic_peptide, min_amino_acids, max_workers):
    """Label all molecules batch-wise in a thread pool and build the NNAA registry.

    Args:
        mol_df: input table; the columns ERROR, MOL, ALL AA and PEPTIDE BONDS
            are (re)initialised and then filled in place.
        batch_size: number of rows handed to each worker task.
        smi2mol, ignore_cyclic_peptide, min_amino_acids: forwarded to
            ``process_molecule_batch``.
        max_workers: thread-pool size.

    Returns:
        ``(NNAA_df, mol_df)``.
    """
    # Initialize columns and dataframes
    mol_df[['ERROR', 'MOL', 'ALL AA', 'PEPTIDE BONDS']] = ["", "", False, ""]
    NNAA_df = pd.DataFrame(columns=['ID', 'SMILES', 'TERMINAL', 'BOND SITES'])

    indices = list(mol_df.index)
    batches = [indices[i:i + batch_size] for i in range(0, len(indices), batch_size)]
    futures = []
    # The bar advances once per batch, so its total is the number of batches.
    # (Floor-dividing the row count dropped the final partial batch.)
    progress_bar = tqdm(total=len(batches), desc="1/4 Labelling molecules", leave=True)

    # Use ThreadPoolExecutor for parallel batch processing
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for batch_indices in batches:
            batch_df = mol_df.loc[batch_indices]
            futures.append(
                executor.submit(process_molecule_batch, batch_df, smi2mol, ignore_cyclic_peptide, min_amino_acids, progress_bar)
            )

    # Leaving the `with` block waits for all tasks, so the bar is complete here.
    progress_bar.close()

    with tqdm(total=len(mol_df), desc="2/4 Storing NNAAs") as pbar:
        for future in as_completed(futures):
            batch_results = future.result()

            for mol_index, mol, all_AA, error, peptide_bonds, NNAAs_info in batch_results:
                if mol is None:
                    mol_df.at[mol_index, 'ERROR'] = error
                    continue

                mol_df.at[mol_index, 'MOL'] = mol
                mol_df.at[mol_index, 'ALL AA'] = all_AA
                mol_df.at[mol_index, 'PEPTIDE BONDS'] = peptide_bonds

                if NNAAs_info:
                    for NNAA, ter_or_not, bond_sites in NNAAs_info:
                        NNAA_df = enlist_NNAA(NNAA, NNAA_df, ter_or_not, bond_sites)

            pbar.update(len(batch_results))

    return NNAA_df, mol_df
728
+
729
def highlight_bonds_with_AA(mol_s):  # with AA colors
    """Map bond index -> list of colours for bonds lying inside one natural amino acid.

    A bond qualifies when both of its atoms carry the same "AA" label and
    that label belongs to a natural amino acid; its colour comes from
    ``aa2color_dict``.

    Returns:
        A ``defaultdict`` of lists keyed by bond index.
    """
    highlights = defaultdict(list)
    for bond in mol_s.GetBonds():
        _, _, label_a, label_b = get_connected_atoms_and_props(bond, mol_s)
        if label_a != label_b or not label_belongs_to_AA(label_a):
            continue
        highlights[bond.GetIdx()].append(aa2color_dict[label_a[:3]])
    return highlights
736
+
737
+
738
def relabel_batch(mol_df, NNAA_df):
    """Turn each labelled molecule of a batch into a sequence string.

    Args:
        mol_df: rows with 'ID', 'MOL', 'ALL AA' and 'PEPTIDE BONDS'.
        NNAA_df: NNAA registry used to resolve "Unk" groups.

    Returns:
        DataFrame with columns ID, SEQUENCE, ERROR and BOND HIGHLIGHTS, one
        row per input row. On failure SEQUENCE is "" and ERROR holds the
        reason (a filter message or the exception text).
    """
    local_mol_data = []

    for _, row in mol_df.iterrows():
        mol_index = row['ID']
        mol = row['MOL']
        all_AA = row['ALL AA']
        peptide_bonds = row['PEPTIDE BONDS']

        # Defined up front so that a failure before the highlights are
        # computed neither raises NameError nor reuses the previous row's.
        bond_highlights = defaultdict(list)

        try:
            # Resolve provisional NNAA labels unless everything is natural.
            if not all_AA:
                mol = relabel_NNAA(mol, NNAA_df)

            bond_highlights = highlight_bonds_with_AA(mol)
            peptide_bonded_props, peptide_bonded_atoms = search_terminal_AA(mol)
            first_atom_index = get_first_atom_index(mol, peptide_bonded_props, peptide_bonded_atoms)
            aa_list = reorder_AAs(mol, first_atom_index)
            split_seq = write_seq(aa_list)
            seq = "".join(split_seq)

            error = filter_out(split_seq, mol, peptide_bonds)

            if error:
                seq = ""

        except Exception as e:
            error = str(e)  # Ensure error is a string
            seq = ""

        local_mol_data.append({'ID': mol_index, 'SEQUENCE': seq, 'ERROR': error, 'BOND HIGHLIGHTS': bond_highlights})

    return pd.DataFrame(local_mol_data)
774
+
775
def relabel_batches(mol_df, NNAA_df, batch_size):
    """Run ``relabel_batch`` over all successfully labelled molecules in a thread pool.

    Args:
        mol_df: molecule table; gains a 'BOND HIGHLIGHTS' column (and a
            'SEQUENCE' column if it has none) in place.
        NNAA_df: NNAA registry.
        batch_size: number of rows per worker task.

    Returns:
        A copy of ``mol_df`` whose SEQUENCE, ERROR and BOND HIGHLIGHTS cells
        are filled in for every processed ID.
    """
    # Check if NNAA_df is empty
    if NNAA_df.empty:
        print("Warning: NNAA_df is empty. No NNAAs to process.")

    # Ensure NNAA_df has an index
    if NNAA_df.index.empty:
        NNAA_df = NNAA_df.reset_index(drop=True)
    mol_df['BOND HIGHLIGHTS'] = ""
    # Guarantee the column exists even when no molecule reaches the relabel
    # step; downstream code selects 'SEQUENCE' unconditionally.
    if 'SEQUENCE' not in mol_df.columns:
        mol_df['SEQUENCE'] = ""
    # Only rows that were given a molecule object are relabelled.
    mol_df_copy = mol_df[mol_df['MOL'] != ""].copy()
    indices = list(mol_df_copy.index)

    def process_batch(batch_indices):
        batch_df = mol_df_copy.loc[batch_indices]
        return relabel_batch(batch_df, NNAA_df)

    with ThreadPoolExecutor() as executor:
        futures = []
        for i in range(0, len(indices), batch_size):
            batch_indices = indices[i:i + batch_size]
            futures.append(executor.submit(process_batch, batch_indices))

        local_mol_df = mol_df.copy()

        for future in tqdm(as_completed(futures), total=len(futures), desc="4/4 Relabelling mols"):
            mol_dataset_per_batch = future.result()

            # Results are written back by matching on the ID column.
            for _, row in mol_dataset_per_batch.iterrows():
                local_mol_df.loc[local_mol_df['ID'] == row['ID'], ['SEQUENCE', 'ERROR', 'BOND HIGHLIGHTS']] = row[['SEQUENCE', 'ERROR', 'BOND HIGHLIGHTS']].values

    return local_mol_df
806
+
807
+
808
+
809
def output_NNAA(NNAA_df, output_dir):
    """Write the NNAA registry to ``<output_dir>/raw/ncAAs_raw.txt`` (tab-separated).

    Rows are collapsed to one per TAUTOMER HASH: ID, SMILES, TERMINAL and bond
    sites come from the first row of the group; TAUTOMERS is the
    comma-joined list of distinct SMILES in the group.

    Args:
        NNAA_df: registry with ID, SMILES, TERMINAL, BOND SITES and
            TAUTOMER HASH columns; a MOL column, if present, is discarded.
        output_dir: directory that already contains a ``raw`` sub-directory.
    """
    # The MOL column only appears once at least one NNAA has been enlisted,
    # so its absence (no NNAA in the whole input) must not be an error.
    NNAA_df = NNAA_df.drop(columns=['MOL'], errors='ignore')

    # One output row per 'TAUTOMER HASH' group.
    NNAA_df = NNAA_df.groupby('TAUTOMER HASH').agg(
        ID=('ID', 'first'),
        SMILES=('SMILES', 'first'),
        TAUTOMERS=('SMILES', lambda x: ','.join(x.unique())),
        TERMINAL=('TERMINAL', 'first'),
        BOND_SITES=('BOND SITES', 'first'),
    ).reset_index().drop_duplicates(subset='TAUTOMER HASH', keep='first')

    NNAA_df = NNAA_df.drop(columns=['TAUTOMER HASH'])

    print(output_dir)

    NNAA_df.to_csv(os.path.join(output_dir, "raw/ncAAs_raw.txt"), sep='\t', index=False)
828
+
829
+
830
def output_mols(mol_df, output_dir, draw):
    """Optionally draw every molecule, then write the sequence table.

    The table goes to ``<output_dir>/raw/sequences_raw.txt`` (tab-separated)
    with ID and SEQUENCE as the first two columns. The 'MOL' and
    'PEPTIDE BONDS' columns are dropped from ``mol_df`` in place.

    Args:
        mol_df: table with ID, SEQUENCE, MOL, PEPTIDE BONDS (and, when
            drawing, BOND HIGHLIGHTS).
        output_dir: directory that already contains a ``raw`` sub-directory.
        draw: when truthy, each row is rendered with ``MoleculeDrawer``.
    """
    if draw:
        drawer = MoleculeDrawer(output_dir)
        for _, row in mol_df.iterrows():
            # Drawing is best effort: a row that cannot be drawn is skipped.
            try:
                drawer.draw_input_mol(row['MOL'], row['ID'], row['SEQUENCE'], row['BOND HIGHLIGHTS'])
            except Exception:
                pass

    mol_df.drop(columns=['MOL', 'PEPTIDE BONDS'], inplace=True)

    # bring 'SEQUENCE' column next to 'ID'
    leading = ['ID', 'SEQUENCE']
    trailing = [col for col in mol_df.columns if col not in leading]
    mol_df = mol_df[leading + trailing]

    target = os.path.join(output_dir, "raw/sequences_raw.txt")
    mol_df.to_csv(target, sep='\t', index=False)
850
+
851
def get_rdkit_tautomer_hash(smi):
    """Return the RegistrationHash TAUTOMER_HASH layer for a SMILES, or None if it cannot be parsed."""
    parsed = Chem.MolFromSmiles(smi)
    if parsed is None:
        return None
    return RegistrationHash.GetMolLayers(parsed)[HashLayer.TAUTOMER_HASH]
857
+
858
def main():
    """Run the whole pipeline: load, label, build the NNAA registry, relabel, write outputs.

    All settings (``input_file``, ``batch_size``, ``ignore_cyclic_peptide``,
    ``min_amino_acids``, ``max_workers``, ``output_dir``, ``draw``) are read
    from module-level names.
    """
    molecules = load_data(input_file)

    # Steps 1/4 and 2/4: label molecules and collect the NNAAs they contain.
    registry, molecules = label_molecules_in_batches(
        molecules, batch_size, smi2mol, ignore_cyclic_peptide, min_amino_acids, max_workers
    )

    # Group NNAAs by tautomer hash, add -OH-stripped variants, assign IDs.
    registry['TAUTOMER HASH'] = registry['SMILES'].apply(get_rdkit_tautomer_hash)
    registry = NNAAs_with_OH_removed(registry)
    registry = add_IDs(registry)

    # Step 4/4: resolve NNAA labels and derive the sequences.
    molecules = relabel_batches(molecules, registry, batch_size)

    output_NNAA(registry, output_dir)
    output_mols(molecules, output_dir, draw)
867
+
868
# Script entry point: parse the command line, publish the settings as
# module-level names, create the output directories and run the pipeline.
if __name__ == '__main__':
    args = parse_arguments()

    # main() reads these names as globals, so they must be assigned here
    # before it is called.
    input_file = args.input_file
    ignore_cyclic_peptide = not args.process_cyclic  # cyclic peptides are rejected unless -process_cyclic is given
    min_amino_acids = args.min_amino_acids
    batch_size = args.batch_size
    output_dir = args.output_dir
    max_workers = args.max_workers
    draw = args.draw

    # Both output writers put their files under <output_dir>/raw.
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, "raw"), exist_ok=True)

    main()
src/prepare_GPepT_data.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import argparse
3
+ import pandas as pd
4
+ import os
5
+
6
# Token appended to every sequence written to the train/validation files.
EOS_TOKEN = '<|endoftext|>'


def _find_sequence_file(output_dir):
    """Return (path, exists) for the sequence table to split.

    The standardized table is preferred. When it is missing the raw table is
    used instead, and ``exists`` reports whether that fallback is present.
    """
    standardized = os.path.join(output_dir, 'standard/sequences_standardized.txt')
    if os.path.exists(standardized):
        return standardized, True
    # No standardized output (e.g. the run had no ncAAs): fall back to raw.
    raw = os.path.join(output_dir, 'raw/sequences_raw.txt')
    return raw, os.path.exists(raw)


def main(argv=None):
    """Split the sequences of one pipeline run into 90% train / 10% validation.

    Args:
        argv: Command-line arguments to parse; ``None`` means ``sys.argv[1:]``.

    Writes ``for_GPepT/train90.txt`` and ``for_GPepT/val10.txt`` below the
    directory given by ``--output_dir``, one sequence per line, each followed
    by ``<|endoftext|>``. Rows whose SEQUENCE is missing are skipped. The
    shuffle uses a fixed seed, so the split is reproducible.

    Raises:
        SystemExit: With status 1 when neither the standardized nor the raw
            sequence table exists.
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(description="Process sequences from an input file and split them into two output files.")
    parser.add_argument('--output_dir', type=str, default='output/tmp', help="Directory containing the input file")
    args = parser.parse_args(argv)

    # Define output file paths
    os.makedirs(os.path.join(args.output_dir, 'for_GPepT'), exist_ok=True)
    output_file_90 = os.path.join(args.output_dir, 'for_GPepT/train90.txt')
    output_file_10 = os.path.join(args.output_dir, 'for_GPepT/val10.txt')

    input_file, found = _find_sequence_file(args.output_dir)
    if not found:
        print(f"Error: The input file '{input_file}' does not exist.")
        # SystemExit instead of the interactive-only ``exit`` helper.
        raise SystemExit(1)

    # Read the input file into a pandas DataFrame
    df = pd.read_csv(input_file, sep='\t')

    # Empty SEQUENCE fields are read back as NaN; appending a string to them
    # would raise TypeError, so drop them before adding the end-of-text token.
    sequences = df['SEQUENCE'].dropna().astype(str) + EOS_TOKEN

    # Shuffle the sequences to randomize the split
    sequences = sequences.sample(frac=1, random_state=42).reset_index(drop=True)

    # Split the sequences into 90% and 10%
    split_index = int(0.9 * len(sequences))
    sequences_90 = sequences[:split_index]
    sequences_10 = sequences[split_index:]

    # Write the sequences to the output files
    sequences_90.to_csv(output_file_90, index=False, header=False)
    sequences_10.to_csv(output_file_10, index=False, header=False)

    print(f"Data has been successfully split into {output_file_90} and {output_file_10}")


if __name__ == "__main__":
    main()
src/standardizer.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import pandas as pd
4
+ import re
5
+ import argparse
6
+ import os
7
+
8
def parse_arguments():
    """Parse the command line and return the resulting namespace.

    The only option is ``--output_dir`` (default ``output/tmp``).
    """
    cli = argparse.ArgumentParser(
        description="Standardize non-canonical amino acids (ncAAs) and sequences.",
    )
    cli.add_argument(
        "--output_dir",
        default='output/tmp',
        help="Directory to save output files.",
    )
    parsed = cli.parse_args()
    return parsed
12
+
13
def main():
    """Relabel run-specific ncAA IDs with the IDs used in ``dictionary.txt``.

    Reads ``raw/ncAAs_raw.txt`` and ``raw/sequences_raw.txt`` below the
    output directory, matches every raw ncAA (ID starting with 'X') to a
    dictionary entry by exact SMILES equality, and writes:

    * ``nc_raw2standard.txt``: raw ID to standard ID, matched entries only;
    * ``standard/nc_standardized.txt``: the ncAA table with relabeled IDs,
      '[UNK]' where the SMILES has no dictionary entry;
    * ``standard/sequences_standardized.txt``: the sequences with relabeled
      IDs. A sequence containing an ncAA without a dictionary entry is
      written with an empty SEQUENCE, because its raw ID has no meaning
      outside this run.

    When the raw ncAA table is missing, empty, or has no ID starting with
    'X', "No ncAAs found." is printed and nothing is written. Any other
    failure (missing dictionary, missing columns, unreadable sequence file)
    propagates to the caller instead of being reported as "No ncAAs found.".
    """
    args = parse_arguments()

    output_dir = args.output_dir

    # Creating 'standard' also creates output_dir itself when it is missing.
    os.makedirs(os.path.join(output_dir, 'standard'), exist_ok=True)

    # Paths for input files
    standard_ncAAs_file = 'dictionary.txt'
    raw_ncAAs_file = os.path.join(output_dir, 'raw/ncAAs_raw.txt')
    sequence_file = os.path.join(output_dir, 'raw/sequences_raw.txt')

    # Paths for output files
    id_mapping_output = os.path.join(output_dir, 'nc_raw2standard.txt')
    relabeled_ncAAs_output = os.path.join(output_dir, 'standard/nc_standardized.txt')
    relabeled_sequence_output = os.path.join(output_dir, 'standard/sequences_standardized.txt')

    try:
        raw_ncAAs = pd.read_csv(raw_ncAAs_file, sep='\t')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print("No ncAAs found.")
        return

    # Keep only rows whose 'ID' starts with 'X'. astype(str) keeps the mask
    # boolean even when the column holds NaN or non-string values.
    raw_ncAAs = raw_ncAAs[raw_ncAAs['ID'].astype(str).str.startswith('X')]
    if raw_ncAAs.empty:
        print("No ncAAs found.")
        return

    standard_ncAAs = pd.read_csv(standard_ncAAs_file, sep='\t')

    # SMILES -> standard ID. The first dictionary entry wins on duplicates,
    # and NaN SMILES are dropped so they can never match anything.
    smiles_to_standard = (
        standard_ncAAs.dropna(subset=['SMILES'])
        .drop_duplicates(subset='SMILES')
        .set_index('SMILES')['ID']
        .to_dict()
    )

    id_map = {}          # raw ID -> standard ID (matched ncAAs only)
    unknown_ids = set()  # raw IDs whose SMILES has no dictionary entry

    standardized_ids = []
    for old_id, smiles in zip(raw_ncAAs['ID'], raw_ncAAs['SMILES']):
        new_id = smiles_to_standard.get(smiles)
        if new_id is None:
            unknown_ids.add(old_id)
            standardized_ids.append("[UNK]")
        else:
            id_map[old_id] = new_id
            standardized_ids.append(new_id)

    # assign() returns a new frame, so nothing is written into the filtered slice.
    raw_ncAAs = raw_ncAAs.assign(ID=standardized_ids)

    # Save the ID mapping
    id_map_df = pd.DataFrame(list(id_map.items()), columns=['raw_ID', 'standard_ID'])
    id_map_df.to_csv(id_mapping_output, sep='\t', index=False)

    raw_ncAAs.to_csv(relabeled_ncAAs_output, sep='\t', index=False)

    # Load the sequence file and drop rows whose 'SEQUENCE' is NaN
    sequence_df = pd.read_csv(sequence_file, sep='\t')
    sequence_df = sequence_df.dropna(subset=['SEQUENCE'])

    # A token starts at every capital letter, which separates each ID.
    token_boundary = re.compile(r"(?=[A-Z])")

    def relabel_sequence(sequence):
        tokens = token_boundary.split(sequence)
        # An ncAA without a dictionary entry invalidates the whole sequence.
        if any(token in unknown_ids for token in tokens):
            return ''
        relabeled_tokens = [id_map.get(token, token) for token in tokens]
        # Also blank the sequence if a token was mapped to '[UNK]' itself.
        if '[UNK]' in relabeled_tokens:
            return ''
        return ''.join(relabeled_tokens)

    # Apply relabeling to each sequence
    sequence_df['SEQUENCE'] = sequence_df['SEQUENCE'].apply(relabel_sequence)

    # Save the relabeled sequences
    sequence_df.to_csv(relabeled_sequence_output, sep='\t', index=False)

    print("Relabeling complete.")
    print(f"ID mapping saved to: {id_mapping_output}")
    print(f"Relabeled ncAAs saved to: {relabeled_ncAAs_output}")
    print(f"Relabeled sequences saved to: {relabeled_sequence_output}")


if __name__ == "__main__":
    main()