Spaces:

Playingyoyo
/

Monomerizer

Running

App Files Files Community

Playingyoyo committed on Feb 7

Commit

084b58f

1 Parent(s): 47e3259

Initial Monomerizer Space

Browse files

Files changed (15) hide show

GPepT_analysis_pipeline.py +59 -0
demo/example.svg +0 -0
demo/example_GPepT_generated_sequences.txt +9 -0
demo/example_smiles.txt +101 -0
demo/example_smiles_IDs.txt +101 -0
dictionary.txt +0 -0
requirements.txt +7 -0
run_pipeline.py +57 -0
src/analyse.py +128 -0
src/demonomerizer.py +211 -0
src/draw.py +67 -0
src/monomer_analyzer.py +1 -0
src/monomerizer.py +882 -0
src/prepare_GPepT_data.py +43 -0
src/standardizer.py +98 -0

GPepT_analysis_pipeline.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import argparse
+import os
+import subprocess
+import sys
+import datetime
+def run_pipeline(sequence_file, output_dir, demonomerized_file, demonomerizer_args=None, analyse_args=None):
+    os.makedirs(output_dir, exist_ok=True)
+    # Step 1: Run demonomerizer.py
+    print(f"Running demonomerizer.py... Input: {sequence_file}")
+    demonomerizer_command = [
+        sys.executable, "src/demonomerizer.py",
+        "--sequence_file", sequence_file,
+        "--NNAA_file", "dictionary.txt",
+        "--batch_size", "8",
+        "--output_dir", output_dir,
+        "--demonomerized_file", demonomerized_file
+    ]
+    subprocess.run(demonomerizer_command, check=True)
+    demonomerized_path = os.path.join(output_dir, demonomerized_file)
+    # Step 2: Run analyse.py
+    print("Running analyse.py...")
+    analyse_command = [
+        sys.executable, "src/analyse.py",
+        "--mols_file", demonomerized_path,
+        "--input_dir", output_dir,
+        "--target_type", "peptides",
+    ]
+    if analyse_args:
+        analyse_command.extend(analyse_args)
+    subprocess.run(analyse_command, check=True)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the demonomerizer pipeline.")
+    parser.add_argument("--sequence_file", default="demonomerized.txt", help="Input sequence file")
+    parser.add_argument("--output_dir", default=f"output/{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}", help="Directory to store output")
+    parser.add_argument("--demonomerized_file", default="sequences_standardized.txt", help="Output demonomerized file name")
+    parser.add_argument("--batch_size", type=int, default=8, help="Batch size for demonomerizer.py")
+    parser.add_argument("-fetch_names", action="store_true", help="Fetch names from PubChem in analyse.py")
+    parser.add_argument("--target_type", default="ncAAs", help="Target type: ncAAs or peptides")
+    args = parser.parse_args()
+    # Args for demonomerizer
+    demonomerizer_args = ["--NNAA_file", "dictionary.txt", "--batch_size", str(args.batch_size)]
+    # Args for analyse
+    analyse_args = []
+    if args.fetch_names:
+        analyse_args.append("-fetch_names")
+    if args.target_type:
+        analyse_args.extend(["--target_type", args.target_type])
+    run_pipeline(args.sequence_file, args.output_dir, args.demonomerized_file, demonomerizer_args, analyse_args)

demo/example.svg ADDED Viewed

demo/example_GPepT_generated_sequences.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+SEQUENCE
+X7681VZ81
+X1132RZ0
+X369X2326Z0
+X72AZ4941
+X183PLGPGZ421
+X2954AZ88
+X34X6765X5Z11
+X47WX47LFKKIGAVLKVLZ0

demo/example_smiles.txt ADDED Viewed

	@@ -0,0 +1,101 @@

+SMILES
+CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NO)NC(=O)OCc1ccccc1)C(C)C)[C@@H](O)CC(=O)NCCc1ccccc1
+N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1cc(I)c(O)c(I)c1)C(N)=O
+NC(=O)[C@@H]1C[C@H](NC(=O)C(F)(F)F)CN1C(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-]
+CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCCNC(=N)N)NC(=O)OCc1ccccc1)[C@@H](O)CC(=O)NC1CCCCC1
+CC(C)C[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)Cc1ccccc1)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
+C[C@H](NC(=O)[C@@H](CO)NS(=O)(=O)c1ccccc1)C(=O)N[C@H]1CCCN(C(=N)N)C1O
+C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](c2csc(-c3ccccc3)n2)CN1C(=O)[C@@H](NC(=O)OC1CCCC1)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
+CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCC[N+](C)(C)C)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CSCC[C@H](NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](C)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)CC(C)C)[C@@H](C)O)C(=O)O
+CSCC[C@H](NC(=O)[C@H](Cc1cnc[nH]1)NC(=O)[C@H](CO)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)Cc1cnc[nH]1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)O
+CC(C)(C)NC(=O)C1(C2CCCCC2)CCN(C(=O)[C@@H](Cc2ccc(F)cc2)NC(=O)[C@@H]2CNC3(CC3)CN2)CC1
+CC(=O)O[C@H]1C(=O)[C@@]2(C)[C@H]([C@H](OC(=O)c3ccccc3)[C@]3(O)CC(OC(=O)[C@H](OC(=O)NCCNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc4ccccc4)NC(=O)[C@@H](C)N)C(NC(=O)c4ccccc4)c4ccccc4)C(C)=C1C3(C)C)[C@]1(OC(C)=O)CO[C@@H]1C[C@@H]2O
+COC(=O)[C@@H](Cc1ccccc1)NC(=O)[C@H](NC(=O)CC(O)CC(CC(C)C)NC(=O)[C@@H](N)Cc1c[nH]cn1)C(C)C
+CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C(C)C)C(O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCC[N+](C)(C)C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
+CC1OC(SCCCCCCNC(=O)[C@H](CCC(N)=O)NC(=O)[C@@H](C)N)C(O)C(O)C1O
+CC1(C)N([O])C(c2ccc(OCC(=O)NCCCNC(=O)[C@H](Cc3ccc(O)cc3)NC(=O)[C@@H](N)CCCNC(=N)N)cc2)=[N+]([O-])C1(C)C
+CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CSCCC[P+](C)(C)C)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O
+CCN(CC)CCC(=O)NC(C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1)C(C)O
+N[C@@H](Cc1cc2ccccc2[nH]1)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)NCCCC(=O)NCC(=O)NCCCCCCOP(=O)(O)Oc1ccccc1Cl
+C[C@@H](N)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)NCCCCC(=O)OCCNc1nc(NCCc2ccccc2)c2cnn(/C=C/c3ccccc3)c2n1
+CCCC[PH](CCCC)(CCCC)Cc1ccc(NC(=O)C2Cc3ccccc3CN2C(=O)[C@@H](N)CCc2ccccc2)cc1
+CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CCCCN)C(=O)N[C@@H](CCCCN)C(=O)O
+Cc1cc(C(=O)N[C@H](C(=O)N[C@@H](Cc2ccc(F)cc2)C(=O)N[C@@H](/C=C/C(=O)OCc2nc3cc(Cl)ccc3[nH]2)CCC(N)=O)C(C)C)no1
+CCN(CC)c1ccc2c(-c3ccc(S(=O)(=O)NCCCC[C@H](NC(=O)Cc4csc(=N)n4C)C(=O)N[C@@H](Cc4cn(Cc5ccccc5)c[n+]4C)C(=O)NC4CCN(C)CC4)cc3S(=O)(=O)[O-])c3ccc(=[N+](CC)CC)cc-3oc2c1
+CC(C)C[C@H](N)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
+CC1=CC(C)=[N+]2C1=Cc1ccc(CCC(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O)n1[B-]2(F)F
+N=C(N)NCCC[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](CC(=O)O)NC(=O)[C@@H](CO)NC(=O)[C@@H](N)CC(=O)O)C(=O)O
+CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@H]2CCCN2C[C@H]1C(=O)N[C@@H]1CCOc2ccccc21)C1CCC(F)(F)CC1
+COc1ccc(NC(=O)[C@@H]2Cc3ccc(OCC(=O)NO)cc3CN2C(=O)[C@H](C)N)cc1
+COc1cccc(COc2ccc([C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCNC(=N)N)NC(=O)c3cccs3)C(N)=O)cc2)c1
+CC(C)CC(N)C(=O)NCC(=O)NC[C@H](C)B1OC2CC3CC(C3(C)C)[C@@]2(C)O1
+CCN(CC)CCNC(=O)c1ccc(C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C=O)CC(C)C)c(NNN2CCCC2)c1
+CC(O)C(NC(=O)CCN)C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1
+CCCN(CC(=O)N[C@H](C=O)CCCN=C(N)N)C(=O)[C@H]1CCCCN1
+CC(C)C[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(C)C)C(C)C)C(=O)N[C@@H](CO)C(=O)O
+CC(C)(N)C(=O)N[C@H](CCCc1ccccc1)C(=O)N1CCC2(CC1)CC(O)c1ccccc12
+CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](N)CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CSc1ccc2n1[B-](F)(F)[N+]1=CC=CC1=C2)C(=O)O
+O=C(NCCCCC[C@H](NC(=O)[C@@H]1C[C@@H](N2CCCCC2)CN1C(=O)[C@@H](CC1CCCCC1)NC(=O)c1ccc2ccccc2c1)B(O)O)NC1CCCCC1
+CC(=O)N[C@@H](CCCO/N=C/c1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCC(=O)O)C(N)=O
+COc1ccc(CC(C)(NC(=O)[C@@H]2CCCN2C(=O)CCCc2ccc(O)cc2)C(=O)NCCCN)cc1OC
+N=C(N)N1CCC(C(NS(=O)(=O)Cc2ccccc2)C(=O)NCC(=O)N[C@H]2CCCN(C(=N)N)C2O)CC1
+CCC(=O)NCCOCCOCCNC(=O)/N=C(\N)NCCC[C@H](NC(=O)[C@@H](N)CC(=O)O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1cnc[nH]1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC)C(C)C
+N=C(N)c1ccc(CNC(=O)[C@@H]2CCCN2C(=O)[C@H](N)C2CCCCC2)cc1
+CC(C)C[C@H](NC(=O)CNC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)NCC(N)=O
+CCCCCCCC(=O)OC[C@H](NC(=O)C(C)(C)N)C(=O)N1CCC2(CC1)CN(S(C)(=O)=O)c1ccccc12
+COc1ccc(NC(C)=O)cc1C(=O)NNC(=O)[C@H](CCCCN)NC(=O)CCOC[C@H]1OC(OCCCNC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)OC(C)(C)C)C(C)C)[C@H](O)[C@@H](O)[C@@H]1O
+CSCC[C@H](NC(=O)[C@@H]1Cc2ccccc2CN1)C(=O)NO
+CSCC[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(C)=O)C(=O)NCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](COS(=O)(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
+C=Cc1c(C)c2cc3nc(c4c5[nH]c(cc6nc(cc1[nH]2)C(C)=C6CC)c(C)c5C(=O)C4)[C@@H](CCC(=O)N[C@H](C(=O)N[C@@H](CO)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCCCNOC(=O)CCCN(C)c1ccc(/N=N/c2cccc4nc5ccc(N(CC)CC)cc5[n+](-c5ccccc5)c24)cc1)C(N)=O)[C@@H](C)O)[C@@H]3C
+CC(=O)NC(Cc1ccc([N+](=O)[O-])cc1)C(=O)NCC(N)C(=O)c1ccccc1
+CC(C)(C)[C@H](NC(=O)Cc1cc(Cl)cc(Cl)c1)C(=O)NCC(=O)NC/C=C/S(C)(=O)=O
+CC(C)CCOc1ccc2ccccc2c1-c1c(OCC(=O)N[C@H](CCCCN)C(=O)N[C@H](CCCN)C(=O)N[C@@H](CC(C)C)C(=O)OCc2ccccc2)ccc2ccccc12
+CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2ccc(OCCCCCN)cc2c1CCCCCCN)C(=O)N[C@@H](CC(C)C)C(=O)O
+CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C)C(C)C
+CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CC[C@H](C)[C@H](NC(=O)[C@@H](N)C(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](C)C(=O)NCC(=O)O
+CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
+CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)(C(=O)CO)C[C@@H]3O[C@H]1C[C@H]2[C@H](OCN2C(=O)OCc2ccc(NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc3ccccc3)NC(=O)[C@@H](C)N)cc2)[C@H](C)O1
+CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)CC(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CC(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O)C(C)C)C(C)C
+CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2c(OCCCCCN=C(N)N)cccc2c1CCCCCCN=C(N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
+CCN(CC(=O)NCC(=O)Nc1cccc(C)c1C)Cc1ccccc1
+CCn1c(-c2nonc2N)nc2c(C#CC(C)(C)O)ncc(OCCCNCCCC(=O)NCCOCCOCCOCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)NCc3ccc(-c4scnc4C)cc3)C(C)(C)C)c21
+Cc1cc2c(s1)-n1c(C)nnc1[C@H](CC(=O)NCCOCCCOCC(=O)N[C@@H](C(=O)N1C[C@H](O)C[C@H]1C(=O)NCc1ccc(-c3scnc3C)cc1)C(C)(C)C)N=C2c1ccc(Cl)cc1
+NC(=O)[C@@H]1C[C@H](NC(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-])CN1
+CC(C)C[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)C[C@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1ccccc1)C(N)=O
+CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCNC(=N)N)NC(=O)CC(c1ccccc1)c1ccccc1)[C@@H](C)O)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1cn(CCN2CCCc3cc(/C=C/C4=C(Br)C(/C=C/c5cc6c7c(c5)CCCN7CCC6)=[O+][B-](F)(C(F)(F)F)O4)ccc32)nn1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O)[C@@H](C)CC)[C@@H](C)CC
+Cc1ncsc1-c1ccc(C2(NC(=O)[C@@H]3C[C@@H](O)CN3C(=O)[C@@H](NC(=O)[C@H]3CC4(C3)C[C@H](N3CCC(c5cnc(N6C7CCC6CN(c6cc(-c8ccccc8O)nnc6N)C7)nc5)CC3)C4)C(C)(C)C)CC2)cc1
+Cc1cc(C)c(CNC(=O)c2cc(-c3ccc(N4CCN(C(=O)CCCCCn5cc(CCCCCC(=O)N[C@@H](C(=O)N6C[C@H](O)C[C@H]6C(=O)NCc6ccc(-c7scnc7C)cc6)C(C)(C)C)nn5)CC4)nc3)cc3c2cnn3C(C)C)c(=O)[nH]1
+CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(C)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC
+CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@H](CNC(=O)c1cccc(S(=O)(=O)F)c1)NC(C)=O)[C@@H](C)CC)C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
+CC(C)[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](Cc1cccc(Cl)c1)NC(=O)C1CCN(C)CC1)C(=O)C(F)(F)F
+COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2ccc(CNC(=O)OC(C)(C)C)c(Cl)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
+COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2cccc(OCC(=O)OC(C)(C)C)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
+COc1ccc([C@H](NC(=O)[C@H](C)NC(=O)C(c2ccc(Cl)cc2)C(C)C)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
+CC(C)C[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCCCN)NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CCCN1C(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](Cc1c[nH]cn1)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)CC(C)C)C(=O)O
+CSCC[C@H](NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCCCN)C(C)C)[C@@H](C)O)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CS)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](CC(C)C)C(=O)O
+CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](N)CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
+Cc1oc2c(c(C)cc3oc(=O)c(CC(=O)NCC(=O)NCC(=O)NCC(C)O)c(C)c32)c1C
+CC[C@H](C)[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(N)=O)NC(C)=O)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O
+Cc1cc(Cn2c(N3CC4(CNC4)C3)nc3c(N4CCN(CCCC(=O)NCCCC(=O)N[C@H](C(=O)N5C[C@H](O)C[C@H]5C(=O)N[C@@H](C)c5ccc(-c6scnc6C)cc5)C(C)(C)C)CC4)cc(Cl)cc32)cc(C)c1F
+Cn1ccc(-c2cc(Cl)c(Cl)c3[nH]c4c(c23)CN(C(=O)CNC(=O)CN2CCNCC2)CC4)n1
+CN(C)CCCCCNC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)Cc1c[nH]cn1
+CN(Cc1ccc2ccccc2c1)C(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]1CCCN1/C(S)=N/Cc1ccccc1Cl
+CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)CNC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CCCN=C(N)N)C(=O)O
+CSCC[C@H](NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CO)NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@H](CCCN=C(N)N)C(=O)O
+O=C(N[C@H](CC1CCCCC1)C(=O)N1C[C@H](N2CCCCC2)C[C@H]1C(=O)N[C@@H](CCCCN1CC2(CSC2)C1)B(O)O)c1ccc2ccccc2c1
+N=C(N)NCCCC[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](Cc1ccccc1)NS(=O)(=O)Cc1ccccc1)C(=O)Cc1ccccc1
+COC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CCCCC(=O)NC[C@@H]1CCN2CC[C@@H](CO[Si](c3ccccc3)(c3ccccc3)C(C)(C)C)N=C2N1)[C@@H](C)O
+C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
+COc1cc(N2CCN(CCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)N[C@@H](C)c3ccc(-c4scnc4C)cc3)C(C)(C)C)CC2)ccc1Nc1ncc(Cl)c(Nc2ccccc2P(C)(C)=O)n1
+C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NNS(=O)(=O)c1ccccc1
+CC[C@H](C)[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CO)NC(=O)CCNC(=S)Nc1ccc(-c2c3ccc(=O)cc-3oc3cc(O)ccc23)c(C(=O)O)c1)C(=O)N[C@@H](CSCc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N1CCC[C@H]1C(N)=O)[C@@H](C)O)C(C)C
+CSCC[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](C)NC(=O)[C@H](CO)NC(=O)[C@H](C)N)[C@@H](C)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CS)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CS)C(=O)NCC(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)O)C(C)C)C(C)C
+CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NP(=O)(O)CCCCN1C(=O)c2ccccc2C1=O)C(=O)NCc1ccccc1
+CC(C)C[C@H](NCC(N)=O)c1cc(F)ccc1N1CCN(C(=O)[C@@H](Cc2ccc(Cl)cc2Cl)N2CCCC2=O)CC1

demo/example_smiles_IDs.txt ADDED Viewed

	@@ -0,0 +1,101 @@

+ID	SMILES
+CHEMBL3782097	CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NO)NC(=O)OCc1ccccc1)C(C)C)[C@@H](O)CC(=O)NCCc1ccccc1
+CHEMBL3819704	N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1cc(I)c(O)c(I)c1)C(N)=O
+CHEMBL2368819	NC(=O)[C@@H]1C[C@H](NC(=O)C(F)(F)F)CN1C(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-]
+CHEMBL3545807	CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCCNC(=N)N)NC(=O)OCc1ccccc1)[C@@H](O)CC(=O)NC1CCCCC1
+CHEMBL3302347	CC(C)C[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)Cc1ccccc1)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
+CHEMBL1184757	C[C@H](NC(=O)[C@@H](CO)NS(=O)(=O)c1ccccc1)C(=O)N[C@H]1CCCN(C(=N)N)C1O
+CHEMBL2403897	C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](c2csc(-c3ccccc3)n2)CN1C(=O)[C@@H](NC(=O)OC1CCCC1)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
+CHEMBL1229044	CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCC[N+](C)(C)C)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CHEMBL2425403	CSCC[C@H](NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](C)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)CC(C)C)[C@@H](C)O)C(=O)O
+CHEMBL2425396	CSCC[C@H](NC(=O)[C@H](Cc1cnc[nH]1)NC(=O)[C@H](CO)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](N)Cc1cnc[nH]1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)O
+CHEMBL1181891	CC(C)(C)NC(=O)C1(C2CCCCC2)CCN(C(=O)[C@@H](Cc2ccc(F)cc2)NC(=O)[C@@H]2CNC3(CC3)CN2)CC1
+CHEMBL1185696	CC(=O)O[C@H]1C(=O)[C@@]2(C)[C@H]([C@H](OC(=O)c3ccccc3)[C@]3(O)CC(OC(=O)[C@H](OC(=O)NCCNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc4ccccc4)NC(=O)[C@@H](C)N)C(NC(=O)c4ccccc4)c4ccccc4)C(C)=C1C3(C)C)[C@]1(OC(C)=O)CO[C@@H]1C[C@@H]2O
+CHEMBL1189783	COC(=O)[C@@H](Cc1ccccc1)NC(=O)[C@H](NC(=O)CC(O)CC(CC(C)C)NC(=O)[C@@H](N)Cc1c[nH]cn1)C(C)C
+CHEMBL1229047	CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCCC[N+](C)(C)C)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C(C)C)C(O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCC[N+](C)(C)C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
+CHEMBL418285	CC1OC(SCCCCCCNC(=O)[C@H](CCC(N)=O)NC(=O)[C@@H](C)N)C(O)C(O)C1O
+CHEMBL3787168	CC1(C)N([O])C(c2ccc(OCC(=O)NCCCNC(=O)[C@H](Cc3ccc(O)cc3)NC(=O)[C@@H](N)CCCNC(=N)N)cc2)=[N+]([O-])C1(C)C
+CHEMBL4302812	CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CSCCC[P+](C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CSCCC[P+](C)(C)C)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O
+CHEMBL1195733	CCN(CC)CCC(=O)NC(C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1)C(C)O
+CHEMBL1179530	N[C@@H](Cc1cc2ccccc2[nH]1)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)NCCCC(=O)NCC(=O)NCCCCCCOP(=O)(O)Oc1ccccc1Cl
+CHEMBL5029048	C[C@@H](N)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)NCCCCC(=O)OCCNc1nc(NCCc2ccccc2)c2cnn(/C=C/c3ccccc3)c2n1
+CHEMBL1199463	CCCC[PH](CCCC)(CCCC)Cc1ccc(NC(=O)C2Cc3ccccc3CN2C(=O)[C@@H](N)CCc2ccccc2)cc1
+CHEMBL2103901	CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CCCCN)C(=O)N[C@@H](CCCCN)C(=O)O
+CHEMBL2165218	Cc1cc(C(=O)N[C@H](C(=O)N[C@@H](Cc2ccc(F)cc2)C(=O)N[C@@H](/C=C/C(=O)OCc2nc3cc(Cl)ccc3[nH]2)CCC(N)=O)C(C)C)no1
+CHEMBL4300381	CCN(CC)c1ccc2c(-c3ccc(S(=O)(=O)NCCCC[C@H](NC(=O)Cc4csc(=N)n4C)C(=O)N[C@@H](Cc4cn(Cc5ccccc5)c[n+]4C)C(=O)NC4CCN(C)CC4)cc3S(=O)(=O)[O-])c3ccc(=[N+](CC)CC)cc-3oc2c1
+CHEMBL3302723	CC(C)C[C@H](N)C(=O)NCC(=O)N1CCC[C@H]1C(=O)NCC(=O)Nc1ccc(N(CCCl)CCCl)cc1
+CHEMBL4301870	CC1=CC(C)=[N+]2C1=Cc1ccc(CCC(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](Cc3ccccc3)C(=O)N[C@@H](CSCCC[P+](C)(C)C)C(N)=O)n1[B-]2(F)F
+CHEMBL2304033	N=C(N)NCCC[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](CC(=O)O)NC(=O)[C@@H](CO)NC(=O)[C@@H](N)CC(=O)O)C(=O)O
+CHEMBL2364835	CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@H]2CCCN2C[C@H]1C(=O)N[C@@H]1CCOc2ccccc21)C1CCC(F)(F)CC1
+CHEMBL1852804	COc1ccc(NC(=O)[C@@H]2Cc3ccc(OCC(=O)NO)cc3CN2C(=O)[C@H](C)N)cc1
+CHEMBL3740745	COc1cccc(COc2ccc([C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCNC(=N)N)NC(=O)c3cccs3)C(N)=O)cc2)c1
+CHEMBL5315308	CC(C)CC(N)C(=O)NCC(=O)NC[C@H](C)B1OC2CC3CC(C3(C)C)[C@@]2(C)O1
+CHEMBL1188598	CCN(CC)CCNC(=O)c1ccc(C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C=O)CC(C)C)c(NNN2CCCC2)c1
+CHEMBL1189941	CC(O)C(NC(=O)CCN)C(=O)N[C@H](Cc1cn(C=O)c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)N(C)Cc1ccccc1
+CHEMBL1191337	CCCN(CC(=O)N[C@H](C=O)CCCN=C(N)N)C(=O)[C@H]1CCCCN1
+CHEMBL3407793	CC(C)C[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(C)C)C(C)C)C(=O)N[C@@H](CO)C(=O)O
+CHEMBL1193469	CC(C)(N)C(=O)N[C@H](CCCc1ccccc1)C(=O)N1CCC2(CC1)CC(O)c1ccccc12
+CHEMBL3408302	CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](N)CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CSc1ccc2n1[B-](F)(F)[N+]1=CC=CC1=C2)C(=O)O
+CHEMBL4597997	O=C(NCCCCC[C@H](NC(=O)[C@@H]1C[C@@H](N2CCCCC2)CN1C(=O)[C@@H](CC1CCCCC1)NC(=O)c1ccc2ccccc2c1)B(O)O)NC1CCCCC1
+CHEMBL3410386	CC(=O)N[C@@H](CCCO/N=C/c1ccccc1)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCC(=O)O)C(N)=O
+CHEMBL1196235	COc1ccc(CC(C)(NC(=O)[C@@H]2CCCN2C(=O)CCCc2ccc(O)cc2)C(=O)NCCCN)cc1OC
+CHEMBL1181305	N=C(N)N1CCC(C(NS(=O)(=O)Cc2ccccc2)C(=O)NCC(=O)N[C@H]2CCCN(C(=N)N)C2O)CC1
+CHEMBL3787701	CCC(=O)NCCOCCOCCNC(=O)/N=C(\N)NCCC[C@H](NC(=O)[C@@H](N)CC(=O)O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1cnc[nH]1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC)C(C)C
+CHEMBL1198214	N=C(N)c1ccc(CNC(=O)[C@@H]2CCCN2C(=O)[C@H](N)C2CCCCC2)cc1
+CHEMBL3304520	CC(C)C[C@H](NC(=O)CNC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)NCC(N)=O
+CHEMBL1179088	CCCCCCCC(=O)OC[C@H](NC(=O)C(C)(C)N)C(=O)N1CCC2(CC1)CN(S(C)(=O)=O)c1ccccc12
+CHEMBL3794663	COc1ccc(NC(C)=O)cc1C(=O)NNC(=O)[C@H](CCCCN)NC(=O)CCOC[C@H]1OC(OCCCNC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)OC(C)(C)C)C(C)C)[C@H](O)[C@@H](O)[C@@H]1O
+CHEMBL1852000	CSCC[C@H](NC(=O)[C@@H]1Cc2ccccc2CN1)C(=O)NO
+CHEMBL1207289	CSCC[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(C)=O)C(=O)NCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](COS(=O)(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
+CHEMBL525036	C=Cc1c(C)c2cc3nc(c4c5[nH]c(cc6nc(cc1[nH]2)C(C)=C6CC)c(C)c5C(=O)C4)[C@@H](CCC(=O)N[C@H](C(=O)N[C@@H](CO)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCCCNOC(=O)CCCN(C)c1ccc(/N=N/c2cccc4nc5ccc(N(CC)CC)cc5[n+](-c5ccccc5)c24)cc1)C(N)=O)[C@@H](C)O)[C@@H]3C
+CHEMBL2361923	CC(=O)NC(Cc1ccc([N+](=O)[O-])cc1)C(=O)NCC(N)C(=O)c1ccccc1
+CHEMBL3354497	CC(C)(C)[C@H](NC(=O)Cc1cc(Cl)cc(Cl)c1)C(=O)NCC(=O)NC/C=C/S(C)(=O)=O
+CHEMBL1199003	CC(C)CCOc1ccc2ccccc2c1-c1c(OCC(=O)N[C@H](CCCCN)C(=O)N[C@H](CCCN)C(=O)N[C@@H](CC(C)C)C(=O)OCc2ccccc2)ccc2ccccc12
+CHEMBL1183069	CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2ccc(OCCCCCN)cc2c1CCCCCCN)C(=O)N[C@@H](CC(C)C)C(=O)O
+CHEMBL3946803	CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C)C(C)C
+CHEMBL3985737	CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CHEMBL3984334	CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CHEMBL284201	CC[C@H](C)[C@H](NC(=O)[C@@H](N)C(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(=O)N[C@@H](C)C(=O)NCC(=O)O
+CHEMBL3890815	CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C
+CHEMBL3944455	CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CHEMBL3890020	CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CHEMBL3891294	CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CHEMBL2219891	COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)(C(=O)CO)C[C@@H]3O[C@H]1C[C@H]2[C@H](OCN2C(=O)OCc2ccc(NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc3ccccc3)NC(=O)[C@@H](C)N)cc2)[C@H](C)O1
+CHEMBL3983321	CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)CC(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(N)=O)C(C)C)C(C)C
+CHEMBL3914919	CC(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O)C(C)C)C(C)C
+CHEMBL1178333	CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)c1[nH]c2c(OCCCCCN=C(N)N)cccc2c1CCCCCCN=C(N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
+CHEMBL1463226	CCN(CC(=O)NCC(=O)Nc1cccc(C)c1C)Cc1ccccc1
+CHEMBL5085501	CCn1c(-c2nonc2N)nc2c(C#CC(C)(C)O)ncc(OCCCNCCCC(=O)NCCOCCOCCOCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)NCc3ccc(-c4scnc4C)cc3)C(C)(C)C)c21
+CHEMBL5286315	Cc1cc2c(s1)-n1c(C)nnc1[C@H](CC(=O)NCCOCCCOCC(=O)N[C@@H](C(=O)N1C[C@H](O)C[C@H]1C(=O)NCc1ccc(-c3scnc3C)cc1)C(C)(C)C)N=C2c1ccc(Cl)cc1
+CHEMBL2368817	NC(=O)[C@@H]1C[C@H](NC(=O)[C@H](N)CCC/N=C(\N)N[N+](=O)[O-])CN1
+CHEMBL5071325	CC(C)C[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)C[C@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1ccccc1)C(N)=O
+CHEMBL5285634	CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCNC(=N)N)NC(=O)CC(c1ccccc1)c1ccccc1)[C@@H](C)O)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@H](C(=O)N[C@@H](Cc1cn(CCN2CCCc3cc(/C=C/C4=C(Br)C(/C=C/c5cc6c7c(c5)CCCN7CCC6)=[O+][B-](F)(C(F)(F)F)O4)ccc32)nn1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O)[C@@H](C)CC)[C@@H](C)CC
+CHEMBL5185804	Cc1ncsc1-c1ccc(C2(NC(=O)[C@@H]3C[C@@H](O)CN3C(=O)[C@@H](NC(=O)[C@H]3CC4(C3)C[C@H](N3CCC(c5cnc(N6C7CCC6CN(c6cc(-c8ccccc8O)nnc6N)C7)nc5)CC3)C4)C(C)(C)C)CC2)cc1
+CHEMBL5202298	Cc1cc(C)c(CNC(=O)c2cc(-c3ccc(N4CCN(C(=O)CCCCCn5cc(CCCCCC(=O)N[C@@H](C(=O)N6C[C@H](O)C[C@H]6C(=O)NCc6ccc(-c7scnc7C)cc6)C(C)(C)C)nn5)CC4)nc3)cc3c2cnn3C(C)C)c(=O)[nH]1
+CHEMBL5090130	CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(C)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)CC
+CHEMBL5091695	CC[C@H](C)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@H](CNC(=O)c1cccc(S(=O)(=O)F)c1)NC(C)=O)[C@@H](C)CC)C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(N)=O
+CHEMBL5090609	CC(C)[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](Cc1cccc(Cl)c1)NC(=O)C1CCN(C)CC1)C(=O)C(F)(F)F
+CHEMBL5073350	COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2ccc(CNC(=O)OC(C)(C)C)c(Cl)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
+CHEMBL5089534	COc1ccc([C@H](NC(=O)[C@H](Cc2cccc(Cl)c2)NC(=O)c2cccc(OCC(=O)OC(C)(C)C)c2)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
+CHEMBL5088663	COc1ccc([C@H](NC(=O)[C@H](C)NC(=O)C(c2ccc(Cl)cc2)C(C)C)C(=O)N[C@H](C(=O)C(F)(F)F)C(C)C)cc1
+CHEMBL5195766	CC(C)C[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CCCCN)NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CCCN1C(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](Cc1c[nH]cn1)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)CC(C)C)C(=O)O
+CHEMBL5077064	CSCC[C@H](NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCCCN)C(C)C)[C@@H](C)O)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CS)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](CC(C)C)C(=O)O
+CHEMBL1766929	CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](N)CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O
+CHEMBL1564198	Cc1oc2c(c(C)cc3oc(=O)c(CC(=O)NCC(=O)NCC(=O)NCC(C)O)c(C)c32)c1C
+CHEMBL5198087	CC[C@H](C)[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(N)=O)NC(C)=O)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(N)=O
+CHEMBL5208458	Cc1cc(Cn2c(N3CC4(CNC4)C3)nc3c(N4CCN(CCCC(=O)NCCCC(=O)N[C@H](C(=O)N5C[C@H](O)C[C@H]5C(=O)N[C@@H](C)c5ccc(-c6scnc6C)cc5)C(C)(C)C)CC4)cc(Cl)cc32)cc(C)c1F
+CHEMBL5075875	Cn1ccc(-c2cc(Cl)c(Cl)c3[nH]c4c(c23)CN(C(=O)CNC(=O)CN2CCNCC2)CC4)n1
+CHEMBL1767019	CN(C)CCCCCNC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](N)Cc1c[nH]cn1
+CHEMBL323044	CN(Cc1ccc2ccccc2c1)C(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]1CCCN1/C(S)=N/Cc1ccccc1Cl
+CHEMBL63188	CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)CNC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CCCN=C(N)N)C(=O)O
+CHEMBL3138731	CSCC[C@H](NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CO)NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@H](CCCN=C(N)N)C(=O)O
+CHEMBL4596927	O=C(N[C@H](CC1CCCCC1)C(=O)N1C[C@H](N2CCCCC2)C[C@H]1C(=O)N[C@@H](CCCCN1CC2(CSC2)C1)B(O)O)c1ccc2ccccc2c1
+CHEMBL1797525	N=C(N)NCCCC[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](Cc1ccccc1)NS(=O)(=O)Cc1ccccc1)C(=O)Cc1ccccc1
+CHEMBL2068547	COC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CCCCC(=O)NC[C@@H]1CCN2CC[C@@H](CO[Si](c3ccccc3)(c3ccccc3)C(C)(C)C)N=C2N1)[C@@H](C)O
+CHEMBL414993	C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NS(=O)(=O)C1CC1
+CHEMBL5078877	COc1cc(N2CCN(CCCC(=O)N[C@H](C(=O)N3C[C@H](O)C[C@H]3C(=O)N[C@@H](C)c3ccc(-c4scnc4C)cc3)C(C)(C)C)CC2)ccc1Nc1ncc(Cl)c(Nc2ccccc2P(C)(C)=O)n1
+CHEMBL414992	C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@H](Oc2cc(-c3ccccc3)nc3cc(OC)ccc23)CN1C(=O)[C@@H](NC(=O)OC(C)(C)C)C(C)(C)C)C(=O)NNS(=O)(=O)c1ccccc1
+CHEMBL5281856	CC[C@H](C)[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CO)NC(=O)CCNC(=S)Nc1ccc(-c2c3ccc(=O)cc-3oc3cc(O)ccc23)c(C(=O)O)c1)C(=O)N[C@@H](CSCc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H](C(=O)N1CCC[C@H]1C(N)=O)[C@@H](C)O)C(C)C
+CHEMBL5094988	CSCC[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](C)NC(=O)[C@H](CO)NC(=O)[C@H](C)N)[C@@H](C)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CS)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CS)C(=O)NCC(=O)N[C@@H](CCCCN)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)O)C(C)C)C(C)C
+CHEMBL419395	CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NP(=O)(O)CCCCN1C(=O)c2ccccc2C1=O)C(=O)NCc1ccccc1
+CHEMBL393789	CC(C)C[C@H](NCC(N)=O)c1cc(F)ccc1N1CCN(C(=O)[C@@H](Cc2ccc(Cl)cc2Cl)N2CCCC2=O)CC1

dictionary.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+pandas==2.2.2
+rdkit-pypi==2022.9.5
+tqdm==4.67.1
+argparse==1.4.0
+matplotlib==3.8.0
+gradio>=5.0.0
+cairosvg

run_pipeline.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import argparse
+import os
+import subprocess
+import sys
+import datetime
def run_pipeline(input_file, output_dir, monomerizer_args=None):
    """Run the three pipeline stages one after another as child processes.

    Stages, in order: src/monomerizer.py, src/standardizer.py and
    src/prepare_GPepT_data.py. Each is started with the current Python
    interpreter and ``check=True``, so a non-zero exit status raises
    ``subprocess.CalledProcessError`` and the later stages are not run.

    Args:
        input_file: Path forwarded to monomerizer.py as ``--input_file``.
        output_dir: Directory created if missing and forwarded to every stage
            as ``--output_dir``.
        monomerizer_args: Optional extra command-line tokens appended to the
            monomerizer.py invocation only.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    interpreter = sys.executable
    output_option = ["--output_dir", output_dir]

    # Step 1: monomerizer.py is the only stage that takes the input file and
    # the caller-supplied extra options.
    print(f"Running monomerizer.py... Input: {input_file}, Output: {output_dir}")
    monomerizer_command = [interpreter, "src/monomerizer.py", "--input_file", input_file, *output_option]
    if monomerizer_args:
        monomerizer_command += monomerizer_args
    subprocess.run(monomerizer_command, check=True)

    # Steps 2 and 3: both remaining stages only need the output directory.
    for script_name in ("standardizer.py", "prepare_GPepT_data.py"):
        print(f"Running {script_name}...")
        subprocess.run([interpreter, f"src/{script_name}", *output_option], check=True)
def build_monomerizer_args(args):
    """Translate parsed pipeline options into extra CLI tokens for monomerizer.py.

    Every returned token is a ``str``: ``subprocess.run`` rejects non-string
    entries in an argument list, which is why ``--min_amino_acids`` must not be
    forwarded as an ``int``.

    Args:
        args: Namespace with ``process_cyclic``, ``min_amino_acids``,
            ``batch_size``, ``max_workers`` and ``draw`` attributes.

    Returns:
        list[str]: Tokens to append to the monomerizer.py command line.
        Options whose value is falsy (flag absent, ``None`` or 0) are omitted.
    """
    extra = []
    if args.process_cyclic:
        extra.append("-process_cyclic")
    valued_options = (
        ("--min_amino_acids", args.min_amino_acids),
        ("--batch_size", args.batch_size),
        ("--max_workers", args.max_workers),
    )
    for flag, value in valued_options:
        if value:
            extra.extend([flag, str(value)])
    if args.draw:
        extra.append("-draw")
    return extra
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run a pipeline of programs sequentially.")
    # Add arguments
    parser.add_argument("--input_file", default="demo/example_smiles.txt", help="Input file for the pipeline")
    parser.add_argument("--output_dir", default=f"output/{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}", help="Output directory")
    parser.add_argument("--process_cyclic", action="store_true", help="Process cyclic compounds")
    parser.add_argument("--min_amino_acids", type=int, help="Minimum number of amino acids required")
    parser.add_argument("--batch_size", type=int, help="Batch size for processing")
    parser.add_argument("--max_workers", type=int, help="Maximum number of workers for parallel processing")
    parser.add_argument("-draw", action="store_true", help="Draw the molecules")
    args = parser.parse_args()
    # Prepare extra arguments for monomerizer.py
    monomerizer_args = build_monomerizer_args(args)
    # Run the pipeline
    run_pipeline(args.input_file, args.output_dir, monomerizer_args=monomerizer_args)

src/analyse.py ADDED Viewed

	@@ -0,0 +1,128 @@

+#!/usr/bin/env python3
+import pandas as pd
+from rdkit import Chem
+from rdkit.Chem import rdMolDescriptors, DataStructs, Descriptors
+import os, sys, requests, tqdm, re, argparse
+from collections import defaultdict
+import xml.etree.ElementTree as ET
def add_canonical_smiles(df):
    """Return ``df`` with one extra row per canonical amino acid appended.

    Each appended row carries the one-letter code as ``ID``, the amino acid
    SMILES, ``CANONICAL='True'``, ``TERMINAL='NotTer'`` and the parsed RDKit
    molecule in ``ROMol``. The input frame is not modified; the result has a
    fresh integer index.

    Code and SMILES are kept side by side in one table so they cannot drift
    apart: previously they lived in two parallel lists in which the
    Asn/Gln and Val/Ile structures were swapped relative to their codes.
    """
    canonical_amino_acids = [
        ("W", "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N"),  # Tryptophan
        ("R", "C(C[C@@H](C(=O)O)N)CNC(=N)N"),            # Arginine
        ("H", "C1=C(NC=N1)C[C@@H](C(=O)O)N"),            # Histidine
        ("P", "C1C[C@H](NC1)C(=O)O"),                    # Proline
        ("K", "C(CCN)C[C@@H](C(=O)O)N"),                 # Lysine
        ("M", "CSCC[C@@H](C(=O)O)N"),                    # Methionine
        ("N", "C([C@@H](C(=O)O)N)C(=O)N"),               # Asparagine
        ("Q", "C(CC(=O)N)[C@@H](C(=O)O)N"),              # Glutamine
        ("E", "C(CC(=O)O)[C@@H](C(=O)O)N"),              # Glutamic acid
        ("D", "OC(=O)C[C@@H](C(=O)O)N"),                 # Aspartic acid
        ("Y", "C1=CC(=CC=C1C[C@@H](C(=O)O)N)O"),         # Tyrosine
        ("F", "C1=CC=C(C=C1)C[C@@H](C(=O)O)N"),          # Phenylalanine
        ("V", "CC(C)[C@@H](C(=O)O)N"),                   # Valine
        ("L", "CC(C)C[C@@H](C(=O)O)N"),                  # Leucine
        ("I", "CC[C@H](C)[C@@H](C(=O)O)N"),              # Isoleucine
        ("T", "C[C@H]([C@@H](C(=O)O)N)O"),               # Threonine
        ("C", "C([C@@H](C(=O)O)N)S"),                    # Cysteine
        ("S", "C([C@@H](C(=O)O)N)O"),                    # Serine
        ("A", "C[C@@H](C(=O)O)N"),                       # Alanine
        ("G", "C(C(=O)O)N"),                             # Glycine
    ]
    one_letter_codes = [code for code, _ in canonical_amino_acids]
    canonical_smiles_list = [smi for _, smi in canonical_amino_acids]
    canonical_df = pd.DataFrame({
        'ID': one_letter_codes,
        'SMILES': canonical_smiles_list,
        'CANONICAL': ['True'] * len(canonical_smiles_list),
        'TERMINAL': ['NotTer'] * len(canonical_smiles_list),
        'ROMol': [Chem.MolFromSmiles(smi) for smi in canonical_smiles_list]
    })
    return pd.concat([df, canonical_df], ignore_index=True)
def cal_tanimoto(mol):
    """Return the Tanimoto similarity between ``mol`` and glycine.

    Both molecules are encoded as Morgan fingerprints of radius 2 before the
    comparison. ``mol`` must be a parsed RDKit molecule.
    """
    glycine = Chem.MolFromSmiles("C(C(=O)O)N")
    query_fp, glycine_fp = [
        rdMolDescriptors.GetMorganFingerprint(candidate, 2)
        for candidate in (mol, glycine)
    ]
    return DataStructs.TanimotoSimilarity(query_fp, glycine_fp)
def fetch_pubchem_name(smiles):
    """Look up the PubChem title of a compound given its SMILES.

    The SMILES is sent as a ``smiles`` query parameter rather than spliced
    into the URL path: SMILES routinely contain ``/``, ``\\`` and ``#``,
    which would otherwise be read as path separators or a fragment marker.
    Passing it through ``params`` lets requests percent-encode it.

    Args:
        smiles: SMILES string of the compound.

    Returns:
        str: The ``Title`` property of the first hit, or ``"NULL"`` when the
        request fails, times out, or the response has no usable entry.
    """
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/property/Title/JSON"
    try:
        # A timeout keeps one stalled connection from hanging a whole run.
        response = requests.get(url, params={"smiles": smiles}, timeout=30)
        response.raise_for_status()
        properties = response.json()['PropertyTable']['Properties']
        return properties[0].get('Title', 'NULL')
    except (requests.exceptions.RequestException, KeyError, IndexError, ValueError):
        # ValueError covers a body that is not valid JSON.
        return "NULL"
def fetch_chembl_similarity(smiles, similarity_threshold=100):
    """Query the ChEMBL similarity endpoint and collect matching ChEMBL IDs.

    Args:
        smiles: SMILES string of the query compound.
        similarity_threshold: Similarity cut-off placed in the request URL.

    Returns:
        list[str]: Text of every ``molecule_chembl_id`` element found under a
        ``molecule`` element of the XML response, or ``["NULL"]`` when the
        request fails, times out, the body is not well-formed XML, or no ID
        is present.
    """
    from urllib.parse import quote

    # Percent-encode the SMILES so characters such as '/', '#' and '\\' are
    # not interpreted as URL structure.
    encoded_smiles = quote(smiles, safe="")
    url = f"https://www.ebi.ac.uk/chembl/api/data/similarity/{encoded_smiles}/{similarity_threshold}"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        root = ET.fromstring(response.content)
    except (requests.exceptions.RequestException, ET.ParseError):
        return ["NULL"]
    chembl_ids = []
    for molecule in root.findall('.//molecule'):
        id_node = molecule.find('.//molecule_chembl_id')
        if id_node is not None:
            chembl_ids.append(id_node.text)
    return chembl_ids if chembl_ids else ["NULL"]
def fetch_names(smiles):
    """Return ``(pubchem_name, chembl_ids)`` for one SMILES string.

    The PubChem lookup runs first, then the ChEMBL similarity lookup; the
    ChEMBL result list is flattened into a single comma-separated string.
    """
    pubchem_name = fetch_pubchem_name(smiles)
    joined_chembl = ",".join(fetch_chembl_similarity(smiles))
    return pubchem_name, joined_chembl
def fetch_rdkit_properties(smiles):
    """Compute seven RDKit descriptors for a SMILES string.

    Returns:
        list: ``[exact mol weight, MolLogP, TPSA, formal charge,
        rotatable bonds, H-bond donors, H-bond acceptors]`` in that order,
        or seven ``"NULL"`` strings when the SMILES cannot be parsed or any
        step raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return ["NULL"] * 7
        # Calculators are applied in the order the caller's columns expect.
        calculators = (
            Descriptors.ExactMolWt,
            Descriptors.MolLogP,
            Descriptors.TPSA,
            Chem.GetFormalCharge,
            Descriptors.NumRotatableBonds,
            Descriptors.NumHDonors,
            Descriptors.NumHAcceptors,
        )
        return [calculate(mol) for calculate in calculators]
    except Exception:
        return ["NULL"] * 7
def count_monomers(mols_df):
    """Count how often each monomer token occurs in the ``SEQUENCE`` column.

    A token is one uppercase ASCII letter followed by any run of characters
    that are not uppercase ASCII letters (so ``"AX1G"`` yields ``A``, ``X1``,
    ``G``). Entries that are not non-empty strings are skipped.

    Returns:
        collections.defaultdict: token -> occurrence count (missing keys
        read as 0).
    """
    token_pattern = re.compile('[A-Z][^A-Z]*')
    tally = defaultdict(int)
    for entry in mols_df['SEQUENCE']:
        if not isinstance(entry, str) or not entry:
            continue
        for monomer in token_pattern.findall(entry):
            tally[monomer] += 1
    return tally
def main():
    """Command-line entry point: annotate a tab-separated SMILES table.

    Reads ``--mols_file`` exactly as given (it is not joined with
    ``--input_dir``), drops rows with a missing or duplicate ``SMILES``,
    adds an RDKit molecule column, optional PubChem/ChEMBL names, the
    Tanimoto similarity to glycine and seven RDKit descriptors, then writes
    the result as CSV to ``<input_dir>/<output_file>``.
    """
    parser = argparse.ArgumentParser(description='Analyse non-natural amino acids (NNAA) from PubChem.')
    parser.add_argument('--input_dir', help='Input directory containing the monomer data.', default='data/tmp')
    parser.add_argument('--mols_file', help='File name relative to input_dir.', default='standard/sequences_standardized.txt')
    parser.add_argument('-fetch_names', help='Fetch names from PubChem and ChEMBL.', action='store_true')
    parser.add_argument('--target_type', help='Type of target: ncAAs or peptides?', default='ncAAs')
    parser.add_argument('--output_file', help='Output CSV file name.', default='analysis.csv')
    args = parser.parse_args()

    source_path = args.mols_file
    destination = os.path.join(args.input_dir, args.output_file)

    table = pd.read_csv(source_path, sep='\t')
    table = table.dropna(subset=['SMILES'])
    table = table.drop_duplicates(subset=['SMILES'])
    table['ROMol'] = table['SMILES'].apply(Chem.MolFromSmiles)

    # Remote look-ups are opt-in because they issue two HTTP requests per row.
    if args.fetch_names:
        name_columns = ['PUBCHEM_NAME', 'CHEMBL_NAMES']
        table[name_columns] = table['SMILES'].apply(fetch_names).tolist()

    table['Tanimoto_to_Glycine'] = table['ROMol'].apply(cal_tanimoto)

    descriptor_columns = [
        'MolWt', 'LogP', 'TPSA', 'FormalCharge',
        'RotatableBonds', 'HydrogenDonors', 'HydrogenAcceptors',
    ]
    table[descriptor_columns] = table['SMILES'].apply(fetch_rdkit_properties).tolist()

    table.to_csv(destination, index=False)
    print(f"Processing completed. Results saved to {destination}")
if __name__ == "__main__":
    main()

src/demonomerizer.py ADDED Viewed

	@@ -0,0 +1,211 @@

+#!/usr/bin/env python3
+import re, ast
+from rdkit import Chem
+import pandas as pd
+from tqdm import tqdm
+import argparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import os
+# Parse the input arguments
+parser = argparse.ArgumentParser(description="Preprocess the generated sequences file")
+parser.add_argument("--sequence_file", type=str, help="Path to the generated sequences file", default="sequences_generated.txt")
+parser.add_argument("--NNAA_file", type=str, help="Path to the NNAA file", default="dictionary.txt")
+parser.add_argument("--batch_size", type=int, help="Batch size for processing sequences", default=8)
+parser.add_argument("--output_dir", type=str, help="Output directory", default="output")
+parser.add_argument("--demonomerized_file", type=str, help="Output demonomerized file name", default="demonomerized.txt")
+args = parser.parse_args()
+valid_backbone = Chem.MolFromSmarts("[NH,NH2]CC(=O)")
+valid_backbone_OH = Chem.MolFromSmarts("[NH,NH2]CC(=O)O")
+peptide_bond_mol = Chem.MolFromSmarts("[N,n][C,c]C(=O)[*!O]") # [*!O] ensures it does not match AAter
+edge_C = 2
+edge_N = 0
+edge_O = 4
+name_smi_dict = {
+    # isomeric SMILES from pubchem. eg https://pubchem.ncbi.nlm.nih.gov/compound/Alanine except for Asp (from https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=3309) and Arg (from https://en.wikipedia.org/wiki/Arginine)
+    "Wter": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N",
+    "W": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O))N",
+    "Rter": "C(C[C@@H](C(=O)O)N)CNC(=N)N",
+    "R": "C(C[C@@H](C(=O))N)CNC(=N)N",
+    "Hter": "C1=C(NC=N1)C[C@@H](C(=O)O)N",
+    "H": "C1=C(NC=N1)C[C@@H](C(=O))N",
+    "Pter": "C1C[C@H](NC1)C(=O)O",
+    "P": "C1C[C@H](NC1)C(=O)",
+    "Kter": "C(CCN)C[C@@H](C(=O)O)N",
+    "K": "C(CCN)C[C@@H](C(=O))N",
+    "Mter": "CSCC[C@@H](C(=O)O)N",
+    "M": "CSCC[C@@H](C(=O))N",
+    "Qter": "C(CC(=O)N)[C@@H](C(=O)O)N",
+    "Q": "C(CC(=O)N)[C@@H](C(=O))N",
+    "Nter": "C([C@@H](C(=O)O)N)C(=O)N",
+    "N": "C([C@@H](C(=O))N)C(=O)N",
+    "Eter": "C(CC(=O)O)[C@@H](C(=O)O)N",
+    "E": "C(CC(=O)O)[C@@H](C(=O))N",
+    "Dter": "OC(=O)C[C@@H](C(=O)O)N",
+    "D": "OC(=O)C[C@@H](C(=O))N",
+    "Yter": "C1=CC(=CC=C1C[C@@H](C(=O)O)N)O",
+    "Y": "C1=CC(=CC=C1C[C@@H](C(=O))N)O",
+    "Fter": "C1=CC=C(C=C1)C[C@@H](C(=O)O)N",
+    "F": "C1=CC=C(C=C1)C[C@@H](C(=O))N",
+    "Iter": "CC[C@H](C)[C@@H](C(=O)O)N",  # TODO add correct hydroxyl oxygen for every AA terminal
+    "I": "CC[C@H](C)[C@@H](C(=O))N",
+    "Lter": "CC(C)C[C@@H](C(=O)O)N",
+    "L": "CC(C)C[C@@H](C(=O))N",
+    "Vter": "CC(C)[C@@H](C(=O)O)N",
+    "V": "CC(C)[C@@H](C(=O))N",
+    "Tter": "C[C@H]([C@@H](C(=O)O)N)O",
+    "T": "C[C@H]([C@@H](C(=O))N)O",
+    "Cter": "C([C@@H](C(=O)O)N)S",
+    "C": "C([C@@H](C(=O))N)S",
+    "Ster": "C([C@@H](C(=O)O)N)O",
+    "S": "C([C@@H](C(=O))N)O",
+    "Ater": "C[C@@H](C(=O)O)N",
+    "A": "C[C@@H](C(=O))N",
+    "Gter": "C(C(=O)O)N",
+    "G": "C(C(=O))N",
+}
def mark_edge(amino, pattern, edge_position):
    """Tag one atom of the first ``pattern`` match in ``amino`` as an edge.

    ``edge_position`` indexes into the tuple of atom indices returned by the
    substructure match; the selected atom gets ``atomNote = "edge"``.

    Returns:
        The tagged atom. An ``IndexError`` propagates when the match tuple
        has no entry at ``edge_position`` (e.g. the pattern did not match).
    """
    atom_idx = amino.GetSubstructMatch(pattern)[edge_position]
    tagged_atom = amino.GetAtomWithIdx(atom_idx)
    tagged_atom.SetProp("atomNote", "edge")
    return tagged_atom
def mark_edge_NNAA(NNAA, bond_sites):
    """Tag the listed atoms of a non-natural amino acid as edge atoms.

    Args:
        NNAA: Molecule whose atoms are tagged with ``atomNote = "edge"``.
        bond_sites: Iterable of atom indices (anything ``int()`` accepts).

    Marking is best effort: if ``bond_sites`` is not iterable, an entry is
    not an integer, or an atom cannot be fetched, ``"No bond sites"`` is
    printed and the function returns; atoms tagged before the failure stay
    tagged. Only ``Exception`` subclasses are swallowed, so Ctrl-C and
    interpreter exit are no longer intercepted.
    """
    try:
        for site in bond_sites:
            atom = NNAA.GetAtomWithIdx(int(site))
            atom.SetProp("atomNote", "edge")
    except Exception:
        print("No bond sites")
def mark_bond_site(mol, index, symbol):
    """Relabel every noted atom of element ``symbol`` with ``str(index)``.

    Any atom that already carries an ``atomNote`` property and whose element
    symbol equals ``symbol`` has that property overwritten, whatever its
    previous value was.
    """
    label = str(index)
    for atom in mol.GetAtoms():
        if not atom.HasProp("atomNote"):
            continue
        if atom.GetSymbol() != symbol:
            continue
        atom.SetProp("atomNote", label)
def clear_props(atom1, atom2):
    """Remove the ``atomNote`` property from both atoms, ``atom1`` first."""
    for atom in (atom1, atom2):
        atom.ClearProp("atomNote")
def get_amino_mol(amino_name, name_smi_dict, NNAA_file):
    """Build the molecule for one monomer name with its bonding atoms tagged.

    Args:
        amino_name: Monomer token to look up.
        name_smi_dict: Mapping of monomer name to SMILES.
        NNAA_file: DataFrame with ``ID`` and ``Bond sites`` columns, consulted
            only when the standard backbone cannot be marked.

    Returns:
        The RDKit molecule, or ``None`` when ``amino_name`` is not a key of
        ``name_smi_dict``.

    The backbone N and C are tagged through ``valid_backbone`` first. If that
    fails, every ``NNAA_file`` row whose ``ID`` equals ``amino_name`` is used
    instead: its ``Bond sites`` literal holds a SMILES followed by the atom
    indices to tag, and the last matching row wins. If no row matches, the
    molecule parsed from ``name_smi_dict`` is returned as it stands.
    """
    # Direct key lookup replaces the former scan over every dictionary item.
    if amino_name not in name_smi_dict:
        return None
    amino_mol = Chem.MolFromSmiles(name_smi_dict[amino_name])
    try:
        mark_edge(amino_mol, valid_backbone, edge_N)
        mark_edge(amino_mol, valid_backbone, edge_C)
    except Exception:
        # Was a bare ``except:``; narrowed so KeyboardInterrupt/SystemExit
        # are not swallowed while still covering any marking failure.
        for _, row in NNAA_file.iterrows():
            if row["ID"] != amino_name:
                continue
            bond_info = ast.literal_eval(row["Bond sites"])
            smiles_rootedAtAtom0 = bond_info[0]
            bond_sites = bond_info[1:]
            amino_mol = Chem.MolFromSmiles(smiles_rootedAtAtom0)
            mark_edge_NNAA(amino_mol, bond_sites)
    return amino_mol
def process_batch(batch_df):
    """Run ``process_row`` on every row of ``batch_df``.

    Returns:
        list[tuple]: one ``(row index, SMILES or None)`` pair per row, in
        row order.
    """
    outcomes = (process_row(row_index, row) for row_index, row in batch_df.iterrows())
    return [(result_index, result_smiles) for result_index, result_smiles in outcomes]
def process_row(index, row):
    """Return ``(index, SMILES)`` for one sequence row, building it if absent.

    Args:
        index: Row label, returned unchanged so the caller can write back.
        row: Mapping/Series with ``SEQUENCE`` and optionally ``SMILES``.

    Returns:
        tuple: ``(index, smiles)``. A non-empty, non-float ``SMILES`` already
        on the row is returned as is. Otherwise the sequence is tokenised,
        each token is turned into a molecule, the molecules are joined and
        the result serialised. ``smiles`` is ``None`` when assembly raises or
        when the assembled SMILES contains ``'.'`` (disconnected fragments).
    """
    # isinstance (not ``type(...) == float``) so float subclasses such as
    # numpy float NaNs are treated as missing instead of crashing on len().
    has_smiles = (
        "SMILES" in row
        and not isinstance(row["SMILES"], float)
        and len(row["SMILES"]) != 0
    )
    if has_smiles:
        return index, row.get("SMILES")  # Return the existing SMILES if present

    seq = row["SEQUENCE"]
    split_seq = regex.findall(seq)
    ordered_aminos = []
    try:
        for alphabet in split_seq:
            ordered_aminos.append(get_amino_mol(alphabet, name_smi_dict, NNAA_file))
        # Replace the last amino with the terminal amino
        amino_ter = split_seq[-1]
        if "ter" not in amino_ter and not amino_ter.startswith("Z"):
            amino_ter = f"{amino_ter}ter"
        ordered_aminos[-1] = get_amino_mol(amino_ter, name_smi_dict, NNAA_file)
        combined = ordered_aminos[0]
        for i in range(len(ordered_aminos) - 1):
            # Label the still-noted C atoms of the chain built so far with i
            # and the noted N atoms of the next residue with i+1, then bond
            # atoms carrying those two labels.
            mark_bond_site(combined, i, "C")
            next_amino = ordered_aminos[i + 1]
            mark_bond_site(next_amino, i + 1, "N")
            combined = Chem.CombineMols(combined, next_amino)
            rwmol = Chem.RWMol(combined)
            for atom1 in rwmol.GetAtoms():
                if atom1.HasProp("atomNote") and atom1.GetProp("atomNote") == f"{i}":
                    for atom2 in rwmol.GetAtoms():
                        if atom2.HasProp("atomNote") and atom2.GetProp("atomNote") == f"{i+1}":
                            rwmol.AddBond(atom1.GetIdx(), atom2.GetIdx(), Chem.BondType.SINGLE)
                            clear_props(atom1, atom2)
                            # Accept the edit only once the molecule shows
                            # exactly i+1 peptide-bond pattern matches.
                            if len(rwmol.GetSubstructMatches(peptide_bond_mol)) == i + 1:
                                combined = rwmol.GetMol()
                                break
        result = Chem.MolToSmiles(combined, isomericSmiles=True, rootedAtAtom=0, canonical=True)
        if '.' in result:
            return index, None  # Indicates unbound atoms
        return index, result
    except Exception:
        print(f"Error in sequence: {seq}")
        return index, None
# Load the monomer dictionary and register every entry in name_smi_dict.
NNAA_file = pd.read_csv(args.NNAA_file, sep="\t")
for _, entry in NNAA_file.iterrows():
    smiles = entry["SMILES"]
    name = entry["ID"]
    NNAA = Chem.MolFromSmiles(smiles)
    if NNAA.HasSubstructMatch(valid_backbone_OH):
        # Entries with a full backbone also get a variant with the matched
        # hydroxyl oxygen removed, stored under the plain ID.
        editable = Chem.RWMol(NNAA)
        hydroxyl_idx = NNAA.GetSubstructMatch(valid_backbone_OH)[edge_O]
        editable.RemoveAtom(hydroxyl_idx)
        name_smi_dict[name] = Chem.MolToSmiles(editable)
    # The unmodified SMILES is stored under "<ID>ter", except for IDs that
    # start with "Z", which keep their ID unchanged.
    terminal_name = name if name.startswith("Z") else f"{name}ter"
    name_smi_dict[terminal_name] = smiles

# Tokens are X<digits>, Z<digits>, or a single capital letter other than X/Z.
tokenizer = r"X\d+|Z\d+|[A-WY]"
regex = re.compile(tokenizer)

df = pd.read_csv(args.sequence_file, sep="\t")
# add a column for SMILES
df["SMILES"] = ""

# Process in batches
batch_size = args.batch_size
batches = [df[start:start + batch_size] for start in range(0, df.shape[0], batch_size)]

# Process batches in parallel
with ThreadPoolExecutor() as executor:
    futures = {executor.submit(process_batch, batch): batch for batch in batches}
    for finished in tqdm(as_completed(futures), total=len(futures)):
        for row_index, built_smiles in finished.result():
            # Rows whose assembly failed keep the empty-string placeholder.
            if built_smiles:
                df.at[row_index, "SMILES"] = built_smiles

# Write the table, with the SMILES column filled in, to the output directory.
os.makedirs(args.output_dir, exist_ok=True)
output_file = os.path.join(args.output_dir, args.demonomerized_file)
df.to_csv(output_file, sep="\t", index=False)

src/draw.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import os
+import matplotlib.pyplot as plt
+from matplotlib.colors import ListedColormap
+from collections import defaultdict
+from rdkit.Chem.Draw import rdMolDraw2D
+import numpy as np
class MoleculeDrawer:
    """Render labelled molecules as SVG files plus a colour legend.

    Images are written to ``<output_dir>/raw/images``. Atoms are coloured by
    the first three characters of their ``AA`` property.
    """

    def __init__(self, output_dir="output/tmp"):
        """Create the image directory and the residue -> RGB colour table."""
        self.output_dir = os.path.join(output_dir, "raw/images")
        os.makedirs(self.output_dir, exist_ok=True)
        self.aa2color_dict = {
            "Asp": (0.902, 0.039, 0.039),
            "Glu": (0.961, 0.1, 0.537),
            "Arg": (0.078, 0.353, 1),
            "Lys": (0.42, 0.353, 1),
            "His": (0.51, 0.51, 0.824),
            "Tyr": (0.196, 0.196, 0.667),
            "Phe": (0.341, 0.196, 0.667),
            "Trp": (0.706, 0.353, 0.706),
            "Asn": (0, 0.863, 0.863),
            "Gln": (0.5, 0.82, 0.863),
            "Met": (0.902, 0.902, 0),
            "Cys": (0.722, 0.902, 0),
            "Ser": (0.98, 0.588, 0),
            "Thr": (0, 0.612, 0.412),
            "Gly": (0.98, 0.922, 0.922),
            "Ala": (0.784, 0.784, 0.639),
            "Val": (0.059, 0.51, 0.059),
            "Leu": (0.29, 0.51, 0.059),
            "Ile": (0.29, 0.51, 0.471),
            "Pro": (1, 0.588, 0.51),
        }

    def sort_atom_highlights(self, mol):
        """Map atom index -> ``[colour]`` for atoms labelled with a known residue.

        Every atom must carry an ``AA`` property. Atoms whose label is
        rejected by ``label_belongs_to_AA`` are left out of the result.
        """
        highlights = {}
        for atom_idx in range(mol.GetNumAtoms()):
            aa_label = mol.GetAtomWithIdx(atom_idx).GetProp("AA")
            if not self.label_belongs_to_AA(aa_label):
                continue
            highlights[atom_idx] = [self.aa2color_dict[aa_label[:3]]]
        return highlights

    def create_colormap(self):
        """Save ``colormap.png``: a colour bar with one tick per residue."""
        entries = [(name[:3], rgb) for name, rgb in self.aa2color_dict.items() if name != "Unk"]
        tick_labels = [label for label, _ in entries]
        palette = ListedColormap([rgb for _, rgb in entries])
        positions = np.arange(len(entries))

        fig, ax = plt.subplots(figsize=(1, 1))
        image = ax.matshow(positions.reshape(1, -1), cmap=palette)
        cbar = fig.colorbar(image, ticks=positions, aspect=5)
        cbar.set_ticklabels(tick_labels)
        cbar.ax.tick_params(labelsize=3)
        ax.axis("off")
        plt.savefig(os.path.join(self.output_dir, "colormap.png"), bbox_inches="tight", dpi=300)
        plt.close()

    def draw_input_mol(self, mol, mol_index, seq, bond_highlights):
        """Draw ``mol`` as ``mol_<mol_index>.svg`` and refresh the colour legend."""
        atom_highlights = self.sort_atom_highlights(mol)
        # Ensure bond_highlights is a dict of lists
        if bond_highlights:
            bond_highlights = {bond: list(colors) for bond, colors in bond_highlights.items()}
        else:
            bond_highlights = {}
        mol_name = f"mol_{mol_index}"
        legend = (
            f"{mol_name}\n"
            f"seq: {seq}\n"
            "8< = peptide bond\n"
            "AA_NAME:SEEN_COUNT:SEQUENCE_POSITION\n"
        )
        self.draw_mol(mol, atom_highlights, bond_highlights, legend, mol_name)
        self.create_colormap()

    def draw_mol(self, mol, atom_highlights, bond_highlights, legend, mol_name):
        """Render ``mol`` with highlights to a 600x300 SVG named ``<mol_name>.svg``."""
        canvas = rdMolDraw2D.MolDraw2DSVG(600, 300)
        canvas.drawOptions().useBWAtomPalette()
        canvas.DrawMoleculeWithHighlights(mol, legend, dict(atom_highlights), dict(bond_highlights), {}, {})
        canvas.FinishDrawing()
        svg_path = os.path.join(self.output_dir, f"{mol_name}.svg")
        with open(svg_path, "w") as handle:
            handle.write(canvas.GetDrawingText())

    def label_belongs_to_AA(self, label):
        """Return True unless ``label`` begins with ``"Unk"`` or ``"X"``."""
        if label[:3] == "Unk":
            return False
        return not label.startswith("X")

src/monomer_analyzer.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

src/monomerizer.py ADDED Viewed

	@@ -0,0 +1,882 @@

+#!/usr/bin/env python3
+# This script takes an isomeric SMILES file as input and outputs a seq (FASTA-like) file with the corresponding amino acid sequence.
+# The script also outputs an isomeric SMILES file with each NNAA (non-natural amino acid) labeled as "X".
+# Any compound connected to a valid backbone is considered as individual amino acid.
+# The NNAAs that do not possess a valid backbone "[NH,NH2]CC(=O)O" required to continuously form peptide bonds, are considered as terminal modifications, and are named as "X0ter", "X1ter", etc.
+import os
+from rdkit import Chem
+from rdkit.Chem import RegistrationHash
+from rdkit.Chem.RegistrationHash import HashLayer
+from collections import deque
+import argparse
+from tqdm import tqdm
+import pandas as pd
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import multiprocessing as mp
+from draw import MoleculeDrawer
+from collections import defaultdict
def parse_arguments():
    """Parse the monomerizer command line from ``sys.argv``.

    Returns:
        argparse.Namespace with ``input_file``, ``process_cyclic``,
        ``min_amino_acids``, ``batch_size``, ``output_dir``, ``max_workers``
        (defaults to the CPU count) and ``draw``.
    """
    parser = argparse.ArgumentParser(description="Process SMILES files and generate amino acid sequences.")
    # Declared as (flag, options) pairs and registered in this order.
    option_table = (
        ("--input_file", {"default": "demo/example_smiles.txt", "help": "Input SMILES file"}),
        ("-process_cyclic", {"action": "store_true", "help": "Process cyclic peptides"}),
        ("--min_amino_acids", {"type": int, "default": 3, "help": "Minimum number of amino acids"}),
        ("--batch_size", {"type": int, "default": 100, "help": "Batch size"}),
        ("--output_dir", {"default": "output/tmp", "help": "Output directory"}),
        ("--max_workers", {"type": int, "default": mp.cpu_count(), "help": "Maximum number of workers for parallel processing"}),
        ("-draw", {"action": "store_true", "help": "Draw molecules"}),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
+# Reference SMILES for the 20 canonical amino acids, keyed by residue name.
+# "<Name>Ter" entries keep the carboxyl hydroxyl oxygen (C(=O)O); the plain
+# "<Name>" entries end in C(=O) without it. match_AA walks the derived smi2mol
+# mapping in insertion order and never re-labels an already matched atom, so
+# earlier entries take precedence over later ones.
+name_smi_dict = {
+    # isomeric SMILES from pubchem. eg https://pubchem.ncbi.nlm.nih.gov/compound/Alanine except for Asp (from https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=3309) and Arg (from https://en.wikipedia.org/wiki/Arginine)
+    "TrpTer": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N",
+    "Trp": "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O))N",
+    "ArgTer": "C(C[C@@H](C(=O)O)N)CNC(=N)N",
+    # NOTE(review): this key contains a space, unlike every other key; it looks
+    # like the terminal counterpart of "Arg2" below - confirm the intended name.
+    "Arg Ter": "NC(N)=NCCC[C@H](N)C(=O)O",
+    "Arg": "C(C[C@@H](C(=O))N)CNC(=N)N",
+    "Arg2": "NC(N)=NCCC[C@H](N)C(=O)",
+    "HisTer": "C1=C(NC=N1)C[C@@H](C(=O)O)N",
+    "His": "C1=C(NC=N1)C[C@@H](C(=O))N",
+    "ProTer": "C1C[C@H](NC1)C(=O)O",
+    "Pro": "C1C[C@H](NC1)C(=O)",
+    "LysTer": "C(CCN)C[C@@H](C(=O)O)N",
+    "Lys": "C(CCN)C[C@@H](C(=O))N",
+    "MetTer": "CSCC[C@@H](C(=O)O)N",
+    "Met": "CSCC[C@@H](C(=O))N",
+    "GlnTer": "C(CC(=O)N)[C@@H](C(=O)O)N",
+    "Gln": "C(CC(=O)N)[C@@H](C(=O))N",
+    "AsnTer": "C([C@@H](C(=O)O)N)C(=O)N",
+    "Asn": "C([C@@H](C(=O))N)C(=O)N",
+    "GluTer": "C(CC(=O)O)[C@@H](C(=O)O)N",
+    "Glu": "C(CC(=O)O)[C@@H](C(=O))N",
+    "AspTer": "OC(=O)C[C@@H](C(=O)O)N",
+    "Asp": "OC(=O)C[C@@H](C(=O))N",
+    "TyrTer": "C1=CC(=CC=C1C[C@@H](C(=O)O)N)O",
+    "Tyr": "C1=CC(=CC=C1C[C@@H](C(=O))N)O",
+    "PheTer": "C1=CC=C(C=C1)C[C@@H](C(=O)O)N",
+    "Phe": "C1=CC=C(C=C1)C[C@@H](C(=O))N",
+    "IleTer": "CC[C@H](C)[C@@H](C(=O)O)N",  # TODO add correct hydroxyl oxygen for every AA terminal
+    "Ile": "CC[C@H](C)[C@@H](C(=O))N",
+    "LeuTer": "CC(C)C[C@@H](C(=O)O)N",
+    "Leu": "CC(C)C[C@@H](C(=O))N",
+    "ValTer": "CC(C)[C@@H](C(=O)O)N",
+    "Val": "CC(C)[C@@H](C(=O))N",
+    "ThrTer": "C[C@H]([C@@H](C(=O)O)N)O",
+    "Thr": "C[C@H]([C@@H](C(=O))N)O",
+    "CysTer": "C([C@@H](C(=O)O)N)S",
+    "Cys": "C([C@@H](C(=O))N)S",
+    "SerTer": "C([C@@H](C(=O)O)N)O",
+    "Ser": "C([C@@H](C(=O))N)O",
+    "AlaTer": "C[C@@H](C(=O)O)N",
+    # FBR: I wonder if we should have a SMILES for AlaStart
+    "Ala": "C[C@@H](C(=O))N",
+    # Saturated the carbon
+    "GlyTer": "C(C(=O)O)N",
+    "Gly": "C(C(=O))N",
+}
+# Residue name -> RDKit molecule parsed from the reference SMILES above
+# (same insertion order as name_smi_dict).
+smi2mol = {}
+for aa_name, aa_smi in name_smi_dict.items():
+    smi2mol[aa_name] = Chem.MolFromSmiles(aa_smi)
+peptide_bond_mol = Chem.MolFromSmarts("[N,n][C,c]C(=O)[*!O]") # [*!O] ensures it does not match AAter
+# Positions inside a peptide_bond_mol match tuple, used by find_peptide_bonds:
+# query atom 2 is the carbonyl carbon, query atom 4 is the [*!O] atom bonded to it.
+edge_C_position = 2
+edge_N_position = 4
+# Pattern tested by detect_terminal to decide between "NotTer" and "ter".
+valid_backbone = Chem.MolFromSmarts("[NH,NH2]CC(=O)[OH]")
+loose_backbone = Chem.MolFromSmarts("[C,c](C(=O)O)[N,n]") # Also detects backbone that contains a benzene ring. Used for removing -OH
+# Position inside a loose_backbone match tuple: query atom 3 is the second
+# (not double-bonded) oxygen of C(=O)O; NNAAs_with_OH_removed deletes it.
+OH_position = 3
+# Template oxygen atom that add_OH appends to a molecule.
+oxygen = Chem.Atom(8)
+# Three-letter residue code -> one-letter code; write_seq looks residues up
+# here using the first three characters of their "AA" label.
+three2one_letter = {
+    "Ala": "A",
+    "Gly": "G",
+    "Ile": "I",
+    "Leu": "L",
+    "Pro": "P",
+    "Val": "V",
+    "Phe": "F",
+    "Trp": "W",
+    "Tyr": "Y",
+    "Asp": "D",
+    "Glu": "E",
+    "Arg": "R",
+    "His": "H",
+    "Lys": "K",
+    "Ser": "S",
+    "Thr": "T",
+    "Cys": "C",
+    "Met": "M",
+    "Asn": "N",
+    "Gln": "Q",
+}
+# Three-letter residue code -> colour triple; highlight_bonds_with_AA uses it
+# to colour bonds that lie inside a single canonical residue.
+# NOTE(review): values look like RGB fractions in the 0-1 range - confirm with the drawer.
+aa2color_dict = {
+    "Asp": (0.902, 0.039, 0.039),
+    "Glu": (0.961, 0.1, 0.537),
+    "Arg": (0.078, 0.353, 1),
+    "Lys": (0.42, 0.353, 1),
+    "His": (0.51, 0.51, 0.824),
+    "Tyr": (0.196, 0.196, 0.667),
+    "Phe": (0.341, 0.196, 0.667),
+    "Trp": (0.706, 0.353, 0.706),
+    "Asn": (0, 0.863, 0.863),
+    "Gln": (0.5, 0.82, 0.863),
+    "Met": (0.902, 0.902, 0),
+    "Cys": (0.722, 0.902, 0),
+    "Ser": (0.98, 0.588, 0),
+    "Thr": (0, 0.612, 0.412),
+    "Gly": (0.98, 0.922, 0.922),
+    "Ala": (0.784, 0.784, 0.639),
+    "Val": (0.059, 0.51, 0.059),
+    "Leu": (0.29, 0.51, 0.059),
+    "Ile": (0.29, 0.51, 0.471),
+    "Pro": (1, 0.588, 0.51),
+}
+# no integer in the tuple was already matched
+def tuple_fully_unmatched(indexes_group, already_matched, mol_a):
+    res = True
+    for i in indexes_group:
+        if mol_a.GetAtomWithIdx(i).HasProp("AA") and mol_a.GetAtomWithIdx(i).GetProp(
+            "AA"
+        ).startswith("Unk"):
+            res = False
+            break
+        if i in already_matched:
+            res = False
+            break
+    return res
+def match_AA(mol_b, dict):
+    atoms_already_matched = set()
+    for aa_name, aa_mol in dict.items():
+        i = 0
+        for atom_indexes_group in mol_b.GetSubstructMatches(aa_mol, useChirality=True):
+            prop = aa_name + ":" + str(i)
+            if tuple_fully_unmatched(atom_indexes_group, atoms_already_matched, mol_b):
+                for a_i in atom_indexes_group:
+                    mol_b.GetAtomWithIdx(a_i).SetProp("AA", prop)
+                    atoms_already_matched.add(a_i)
+                i += 1
+def find_peptide_bonds(mol_c):
+    atom_indices_surrounding_peptide_bond = []
+    for bonded_AA in mol_c.GetSubstructMatches(peptide_bond_mol):
+        C_idx = mol_c.GetAtomWithIdx(bonded_AA[edge_C_position]).GetIdx()
+        N_idx = mol_c.GetAtomWithIdx(bonded_AA[edge_N_position]).GetIdx()
+        atom_indices_surrounding_peptide_bond.append([C_idx, N_idx])
+    return atom_indices_surrounding_peptide_bond
+def set_peptide_bond_prop(mol, atom_indices_surrounding_peptide_bond):
+    peptide_bonds = []
+    for C_idx, N_idx in atom_indices_surrounding_peptide_bond:
+        mol.GetAtomWithIdx(C_idx).SetProp("bond_site", "C")
+        mol.GetAtomWithIdx(N_idx).SetProp("bond_site", "N")
+        peptide_bond = mol.GetBondBetweenAtoms(C_idx, N_idx)
+        peptide_bond.SetProp("bondNote", "8<")
+        peptide_bond.SetProp("peptide_bond", "peptide_bond")
+        peptide_bonds.append(peptide_bond.GetIdx())
+    return peptide_bonds
+def label_peptide_bonds(mol_e):
+    atom_indices_surrounding_peptide_bond = find_peptide_bonds(mol_e)
+    peptide_bonds = set_peptide_bond_prop(mol_e, atom_indices_surrounding_peptide_bond)
+    return peptide_bonds
+def label_NNAAs(mol_e, peptide_bonds):
+    NNAA_idx = 0
+    for a_i in range(mol_e.GetNumHeavyAtoms()):
+        the_atom = mol_e.GetAtomWithIdx(a_i)
+        if not the_atom.HasProp("AA"):
+            atom_index_of_the_NNAA = the_atom.GetIdx()
+            label_unmatched_NNAA(
+                mol_e, atom_index_of_the_NNAA, NNAA_idx, peptide_bonds
+            )
+            NNAA_idx += 1
+    return NNAA_idx
+def prepare_graph(first_atom_index):
+    queue = deque([first_atom_index])
+    visited = set([first_atom_index])
+    return queue, visited
+def enqueue_neighbor_indices(mol_f, atom, queue, visited):
+    neighbor_indices = [neighbor[1] for neighbor in get_neighbors(mol_f, atom)]
+    for neighbor_atom_idx in neighbor_indices:
+        if neighbor_atom_idx not in visited:
+            queue.append(neighbor_atom_idx)
+            visited.add(neighbor_atom_idx)
+    return queue, visited
+def get_neighbors(mol_g, atom):
+    neighbors_and_indices = []
+    for neighbor_atom in atom.GetNeighbors():
+        neighbor_atom_idx = neighbor_atom.GetIdx()
+        neighbor_atom = mol_g.GetAtomWithIdx(neighbor_atom_idx)
+        neighbors_and_indices.append([neighbor_atom, neighbor_atom_idx])
+    return neighbors_and_indices
+def cross_peptide_bond(mol_f, current_atom_idx, neighbor_idx, peptide_bonds):
+    bond_i = mol_f.GetBondBetweenAtoms(current_atom_idx, neighbor_idx).GetIdx()
+    return bond_i in peptide_bonds
+def NNAA_continues(neighbor_atom, first_AA_observed):
+    return (
+        neighbor_atom.HasProp("AA") == False
+        or neighbor_atom.GetProp("AA") == first_AA_observed
+    )
+def get_current_atom_with_prop(mol_h, atom_idx_queue, prop):
+    current_atom_idx = atom_idx_queue.popleft()
+    current_atom = mol_h.GetAtomWithIdx(current_atom_idx)
+    current_atom.SetProp("AA", prop)
+    return current_atom, current_atom_idx
+def label_unmatched_NNAA(mol, atom_index_of_the_NNAA, NNAA_idx, peptide_bonds):
+    atom_idx_queue, visited_atoms = prepare_graph(atom_index_of_the_NNAA)
+    first_AA_observed = None
+    prop = f"Unk{NNAA_idx}"
+    while atom_idx_queue:
+        current_atom, current_atom_idx = get_current_atom_with_prop(
+            mol, atom_idx_queue, prop
+        )
+        neighbors_and_indices = get_neighbors(mol, current_atom)
+        for neighbor in neighbors_and_indices:
+            neighbor_atom, neighbor_idx = neighbor
+            if neighbor_idx not in visited_atoms and not cross_peptide_bond(
+                mol, current_atom_idx, neighbor_idx, peptide_bonds
+            ):
+                visited_atoms.add(neighbor_idx)
+                if NNAA_continues(neighbor_atom, first_AA_observed):
+                    atom_idx_queue.append(neighbor_idx)
+                elif first_AA_observed is None:  # first_AA_observed unseen
+                    first_AA_observed = neighbor_atom.GetProp("AA")
+                    atom_idx_queue.append(neighbor_idx)
+def get_first_base_aa(mol_j, first_atom_index):
+    first_atom = mol_j.GetAtomWithIdx(first_atom_index)
+    current_base_aa = first_atom.GetProp("AA")
+    return current_base_aa
+def label_boundary_bonds(mol):
+    for bond in mol.GetBonds():
+        atom1_i, atom2_i, prop1, prop2 = get_connected_atoms_and_props(bond, mol)
+        if (
+            prop1 != prop2
+        ):
+            bond.SetProp("boundary", "boundary")
+            mol.GetAtomWithIdx(atom1_i).SetProp("bond_site", "bond_site")
+            mol.GetAtomWithIdx(atom2_i).SetProp("bond_site", "bond_site")
+def add_order_to_atomNote(mol_v, aa_order, current_base_aa):
+    for atom_idx in range(mol_v.GetNumAtoms()):
+        atom = mol_v.GetAtomWithIdx(atom_idx)
+        if atom.GetProp("AA") == current_base_aa:
+            atom.SetProp("atomNote", f"{current_base_aa}:{aa_order}")
+def reorder_AAs(mol_k, first_atom_index):
+    atom_idx_queue, visited_atom_indices = prepare_graph(first_atom_index)
+    aa_list = []
+    aa_order = 1
+    current_base_aa = get_first_base_aa(mol_k, first_atom_index)
+    while atom_idx_queue:
+        add_order_to_atomNote(mol_k, aa_order, current_base_aa)
+        atom_index = atom_idx_queue.popleft()
+        the_atom = mol_k.GetAtomWithIdx(atom_index)
+        aa_in_question = the_atom.GetProp("AA")
+        if current_base_aa != aa_in_question:
+            current_base_aa, atom_idx_queue = switch_base_and_empty_queue(
+                aa_list, current_base_aa, aa_in_question, atom_idx_queue, atom_index
+            )
+            aa_order += 1
+        enqueue_neighbor_indices(mol_k, the_atom, atom_idx_queue, visited_atom_indices)
+    aa_list.append(current_base_aa)  # append the last AA
+    return aa_list
+def switch_base_and_empty_queue(
+    aa_list, current_base_aa, aa_in_question, atom_idx_queue, idx
+):
+    aa_list.append(current_base_aa)
+    current_base_aa = aa_in_question
+    atom_idx_queue = deque([idx])
+    return current_base_aa, atom_idx_queue
+def label_belongs_to_AA(label):
+    shorter_label = label[:3]
+    return shorter_label != "Unk" and not label.startswith("X")
+def record_if_terminal(peptide_bonded_props, peptide_bonded_atoms, prop, atom):
+    if (
+        prop in peptide_bonded_props
+    ):  # the peptide bond was seen twice i.e. it has both ends
+        peptide_bonded_props.remove(prop)
+    else:
+        peptide_bonded_props.append(prop)
+        peptide_bonded_atoms.append(atom)
+def get_first_atom_index(mol_l, peptide_bonded_props, peptide_bonded_atoms):
+    first_atom_index = 0
+    for a_i in range(mol_l.GetNumAtoms()):
+        a = mol_l.GetAtomWithIdx(a_i)
+        if (
+            a_i in peptide_bonded_atoms
+            and a.GetProp("AA") in peptide_bonded_props
+            and a.GetSymbol() == "C"
+        ):
+            first_atom_index = a_i
+            break
+    return first_atom_index
+def mol_is_cyclic_peptide(mol_u, ignore_cyclic_peptide):
+    if ignore_cyclic_peptide == False:
+        return False
+    for bond in mol_u.GetBonds():  # for any bond including peptide bonds
+        if bond.IsInRing() and (bond.HasProp("boundary") or bond.HasProp("peptide_bond")):
+            return True
+def search_terminal_AA(mol_m):  # for highlight and searching terminal AA
+    peptide_bonded_props, peptide_bonded_atoms = [], []
+    for bond in mol_m.GetBonds():  # for any bond including peptide bonds
+        atom1_i, atom2_i, prop1, prop2 = get_connected_atoms_and_props(bond, mol_m)
+        if bond.HasProp(
+            "peptide_bond"
+        ):  # will remain in the list only if it is connected to a terminal AA
+            record_if_terminal(
+                peptide_bonded_props, peptide_bonded_atoms, prop1, atom1_i
+            )
+            record_if_terminal(
+                peptide_bonded_props, peptide_bonded_atoms, prop2, atom2_i
+            )
+    return peptide_bonded_props, peptide_bonded_atoms
+def get_connected_atoms_and_props(bond, mol_t):
+    atom1_i, atom2_i = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+    prop1, prop2 = mol_t.GetAtomWithIdx(atom1_i).GetProp("AA"), mol_t.GetAtomWithIdx(
+        atom2_i
+    ).GetProp("AA")
+    return atom1_i, atom2_i, prop1, prop2
+def write_seq(aa_list):
+    split_seq = []
+    for aa in aa_list:
+        if aa[:3] == ("Unk"):
+            acid = "?"
+        elif aa.startswith("X"):
+            acid = aa.split(":")[0]
+        else:
+            acid = three2one_letter[aa[:3]]
+        split_seq.append(acid)
+    return split_seq
+def get_NNAAs(mol):
+    rwmol = Chem.RWMol(mol)
+    remove_peptide_bonds(rwmol) # this needs to come before remove_atoms
+    remove_atoms(rwmol, mol, label_belongs_to_AA)
+    try:
+        return Chem.GetMolFrags(rwmol, asMols=True, sanitizeFrags=True)
+    except ValueError:
+        return "error"
+def remove_atoms(rwmol, mol, func, **kwargs):
+    atom_number = mol.GetNumAtoms() - 1
+    while atom_number >= 0:
+            prop = rwmol.GetAtomWithIdx(atom_number).GetProp("AA")
+            if func(prop, **kwargs):
+                rwmol.RemoveAtom(atom_number)
+            atom_number -= 1
+def add_OH(rwmol, begin_atom_idx, end_atom_idx):
+    rwmol.AddAtom(oxygen)
+    oxygen_idx = rwmol.GetNumAtoms() -1
+    if rwmol.GetAtomWithIdx(begin_atom_idx).GetAtomicNum() == 6: # Carbon
+        rwmol.AddBond(begin_atom_idx, oxygen_idx, Chem.BondType.SINGLE)
+    elif rwmol.GetAtomWithIdx(end_atom_idx).GetAtomicNum() == 6: # Carbon
+        rwmol.AddBond(oxygen_idx, end_atom_idx, Chem.BondType.SINGLE)
+def remove_peptide_bonds(rwmol):
+    current_bond_idx = rwmol.GetNumBonds() - 1
+    while current_bond_idx >= 0:
+        current_bond = rwmol.GetBondWithIdx(current_bond_idx)
+        if current_bond.HasProp("peptide_bond") and current_bond.IsInRing() == False:
+            begin_atom_idx, end_atom_idx = current_bond.GetBeginAtomIdx(), current_bond.GetEndAtomIdx()
+            rwmol.RemoveBond(
+                begin_atom_idx, end_atom_idx
+            )
+            add_OH(rwmol, begin_atom_idx, end_atom_idx)
+        current_bond_idx -= 1
+def detect_terminal(NNAA):
+    if NNAA.HasSubstructMatch(valid_backbone):
+        return "NotTer"
+    else:
+        return "ter" # don't use capital letter, for tokenization
+def enlist_NNAA(new_NNAA, df, ter_or_not, bond_atom_indices):
+    new_smi = Chem.MolToSmiles(new_NNAA, isomericSmiles=True, canonical=True)
+    new_smi_rootedAtAtom0 = Chem.MolToSmiles(new_NNAA, isomericSmiles=True, canonical=True, rootedAtAtom=0)
+    bond_atom_indices = [new_smi_rootedAtAtom0] + bond_atom_indices
+    new_data = pd.DataFrame({
+        'SMILES': [new_smi],
+        'TERMINAL': [ter_or_not],
+        'BOND SITES': [bond_atom_indices],
+        'MOL': [new_NNAA]
+    })
+    df = pd.concat([df, new_data], ignore_index=True)
+    # deduplicate by SMILES
+    df = df.drop_duplicates(subset=['SMILES'])
+    return df
+def add_IDs(df):
+    # group df by TAUTOMER HASH
+    tautomer_groups = df['TAUTOMER HASH'].drop_duplicates().reset_index(drop=True)
+    for i, tautomer_hash in enumerate(tautomer_groups):
+        df.loc[df['TAUTOMER HASH'] == tautomer_hash, 'ID'] = f"X{i}"
+    # if ['TERMINAL'] == 'ter', add 'ter' to the ID
+    df.loc[df['TERMINAL'] == 'ter', 'ID'] = df['ID'] + 'ter'
+    return df
+def relabel_NNAA(mol, NNAA_df):
+    visited_Unk_labels, visited_NNAA_labels = [], []
+    for atom_idx in range(mol.GetNumAtoms()):
+        try:
+            label = mol.GetAtomWithIdx(atom_idx).GetProp("AA")
+            if label.startswith("Unk") and label not in visited_Unk_labels:
+                visited_Unk_labels.append(label)
+                rwmol_from_peptide = Chem.RWMol(mol)
+                remove_atoms(rwmol_from_peptide, mol, different_NNAA, Unk_label=label)
+                for idx, NNAA_row in NNAA_df.iterrows():
+                    if perfect_match(rwmol_from_peptide, NNAA_row['MOL']):
+                        nnaa_name = NNAA_row['ID']
+                        seen_times = visited_NNAA_labels.count(nnaa_name)
+                        nnaa_prop = f"{nnaa_name}:{seen_times}"
+                        mol = relabel_prop(mol, label, nnaa_prop)
+                        visited_NNAA_labels.append(nnaa_name)
+                        break
+        except:
+            continue
+    return mol
+def different_NNAA(label, Unk_label):
+    return label != Unk_label
+def relabel_prop(mol, label, nnaa_name):
+    for atom_idx in range(mol.GetNumAtoms()):
+        try:
+            atom = mol.GetAtomWithIdx(atom_idx)
+            if atom.HasProp("AA") and atom.GetProp("AA") == label:
+                atom.SetProp("AA", nnaa_name)
+        except:
+            continue
+    return mol
+def perfect_match(rwmol_NNAA, nnaa_mol):
+    return (
+        rwmol_NNAA.HasSubstructMatch(nnaa_mol, useChirality=True)
+        and nnaa_mol.GetNumAtoms() == rwmol_NNAA.GetNumAtoms()
+    )
+def NNAAs_with_OH_removed(NNAA_df):
+    new_rows = []  # List to store the new rows
+    for _, row in NNAA_df.iterrows():
+        mol = row['MOL']
+        rwmol_NNAA = Chem.RWMol(mol)
+        backbone_indices = rwmol_NNAA.GetSubstructMatches(loose_backbone)
+        for backbone_index in backbone_indices:
+            OH_atom_i = backbone_index[OH_position]
+            rwmol_NNAA.GetAtomWithIdx(OH_atom_i).SetProp("ToBeRemoved", "ToBeRemoved")
+        num_atoms = rwmol_NNAA.GetNumAtoms() - 1
+        while num_atoms >= 0:
+            if rwmol_NNAA.GetAtomWithIdx(num_atoms).HasProp("ToBeRemoved"):
+                rwmol_NNAA.RemoveAtom(num_atoms)
+                result_mol = rwmol_NNAA.GetMol()
+                # Add a new row to new_rows with the same data except for the modified 'MOL'
+                new_row = row.copy()
+                new_row['MOL'] = result_mol
+                new_rows.append(new_row)
+                num_atoms -= 1
+            num_atoms -= 1
+    # Convert new_rows to a DataFrame and concatenate with the original NNAA_df
+    new_rows_df = pd.DataFrame(new_rows)
+    NNAA_df = pd.concat([NNAA_df, new_rows_df], ignore_index=True)
+    return NNAA_df
+def remove_small_substructs(mol):
+    substructures = Chem.GetMolFrags(mol, asMols=True)
+    if len(substructures) <= 1:
+        return mol, False
+    else:
+        error = "Multiple substructures. Removing the smaller ones."
+        substructure_sizes = [sub.GetNumAtoms() for sub in substructures]
+        largest_substructure_index = substructure_sizes.index(max(substructure_sizes))
+        for i in range(len(substructures)):
+            if i != largest_substructure_index:
+                modified_mol = Chem.DeleteSubstructs(mol, substructures[i])
+        return modified_mol, error
+def has_unlabelled_atom(mol, seq_list):
+    if "?" in seq_list:
+        return True
+    for atom in mol.GetAtoms():
+        if not atom.HasProp("AA"):
+            return True
+    return False
+def linear(peptide_bonds, aminos):
+    return len(peptide_bonds) == len(aminos) - 1
+def ter_in_the_middle(seq_list):
+    for i, amino in enumerate(seq_list):
+        if amino.endswith("ter") and i != 0 and i != len(seq_list) - 1:
+            return True
+def filter_out(seq_list, mol, peptide_bonds):
+    if not linear(peptide_bonds, seq_list):
+        return "Not linear"
+    if has_unlabelled_atom(mol, seq_list):
+        return "Has unlabelled atom"
+    if ter_in_the_middle(seq_list):
+        return "Terminal amino acid in the middle"
+    return False
+def record_bond_sites(NNAA):
+    indices = []
+    for atom in NNAA.GetAtoms():
+        if atom.HasProp("bond_site"):
+            indices.append(atom.GetIdx())
+    return indices
+def count_aminos(split_seq, NNAA_counts):
+    for amino in split_seq:
+        # Count the number of times each NNAA is seen in the output sequences
+        if amino.startswith("X"):
+            if amino in NNAA_counts:
+                NNAA_counts[amino] += 1
+            else:
+                NNAA_counts[amino] = 1
+    return NNAA_counts
+def load_data(input_file):
+    # Load the data
+    print("0/4 Loading input data...")
+    df = pd.read_csv(input_file, sep='\t', on_bad_lines='warn')
+    # Check if the 'ID' column exists
+    if 'ID' not in df.columns:
+        df['ID'] = range(1, len(df) + 1)  # Create an 'ID' column with unique sequential numbers
+    # Check if the 'ISOSMILES' column exists
+    if 'ISOSMILES' not in df.columns:
+        df['ISOSMILES'] = None  # Create an empty 'ISOSMILES' column if it doesn't exist
+    # Check if the 'SMILES' column exists
+    if 'SMILES' not in df.columns:
+        df['SMILES'] = None  # Create an empty 'SMILES' column if it doesn't exist
+    # Determine which column to use for the SMILES
+    df['SMILES'] = df['ISOSMILES'].fillna(df['SMILES']).str.strip()
+    # Remove rows where both 'ISOSMILES' and 'SMILES' are missing or empty
+    df = df[df['SMILES'].ne("")]
+    # Drop ISOSMILES column
+    df = df.drop(columns=['ISOSMILES'])
+    # drop rows where 'SMILES' is empty
+    df = df[(df['SMILES'] != '') & (df['SMILES'].notna())]
+    # convert to a dataframe
+    df = pd.DataFrame(df)
+    return df
+def process_molecule_batch(batch_df, smi2mol, ignore_cyclic_peptide, min_amino_acids, progress_bar):
+    local_mol_data = []
+    for mol_index, row in batch_df.iterrows():
+        try:
+            smi = row['SMILES']
+            if not smi:
+                local_mol_data.append((mol_index, None, None, "No SMILES provided", None, None))
+                continue
+            mol = Chem.MolFromSmiles(smi)
+            if mol is None:
+                local_mol_data.append((mol_index, None, None, "Invalid SMILES", None, None))
+                continue
+            mol, error = remove_small_substructs(mol)
+            if error:
+                local_mol_data.append((mol_index, None, None, error, None, None))
+                continue
+            match_AA(mol, smi2mol)
+            peptide_bonds = label_peptide_bonds(mol)
+            if len(peptide_bonds) < min_amino_acids - 1:
+                local_mol_data.append((mol_index, None, None, "Not enough amino acids", None, None))
+                continue
+            num_NNAAs = label_NNAAs(mol, peptide_bonds)
+            all_AA = num_NNAAs == 0
+            label_boundary_bonds(mol)
+            if mol_is_cyclic_peptide(mol, ignore_cyclic_peptide):
+                local_mol_data.append((mol_index, None, None, "Cyclic peptide", None, None))
+                continue
+            NNAAs_info = []
+            if not all_AA:
+                NNAAs = get_NNAAs(mol)
+                if NNAAs == "error":
+                    local_mol_data.append((mol_index, None, None, "Disconnected molecule", None, None))
+                    continue
+                else:
+                    for NNAA in NNAAs:
+                        ter_or_not = detect_terminal(NNAA)
+                        bond_sites = record_bond_sites(NNAA)
+                        NNAAs_info.append((NNAA, ter_or_not, bond_sites))
+            local_mol_data.append((mol_index, mol, all_AA, None, peptide_bonds, NNAAs_info))
+        except:
+            local_mol_data.append((mol_index, None, None, "Unknown error", None, None))
+    progress_bar.update(1)
+    return local_mol_data
+def label_molecules_in_batches(mol_df, batch_size, smi2mol, ignore_cyclic_peptide, min_amino_acids, max_workers):
+    # Initialize columns and dataframes
+    mol_df[['ERROR', 'MOL', 'ALL AA', 'PEPTIDE BONDS']] = ["", "", False, ""]
+    NNAA_df = pd.DataFrame(columns=['ID', 'SMILES', 'TERMINAL', 'BOND SITES'])
+    indices = list(mol_df.index)
+    batches = [indices[i:i + batch_size] for i in range(0, len(indices), batch_size)]
+    futures = []
+    progress_bar = tqdm(total=len(indices) // batch_size, desc="1/4 Labelling molecules", leave=True)
+    # Use ThreadPoolExecutor for parallel batch processing
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        for batch_indices in batches:
+            batch_df = mol_df.loc[batch_indices]
+            futures.append(
+                executor.submit(process_molecule_batch, batch_df, smi2mol, ignore_cyclic_peptide, min_amino_acids, progress_bar)
+            )
+    progress_bar.close()
+    with tqdm(total=len(mol_df), desc="2/4 Storing NNAAs") as pbar:
+            for future in as_completed(futures):
+                batch_results = future.result()
+                for mol_index, mol, all_AA, error, peptide_bonds, NNAAs_info in batch_results:
+                    if mol is None:
+                        mol_df.at[mol_index, 'ERROR'] = error
+                        continue
+                    mol_df.at[mol_index, 'MOL'] = mol
+                    mol_df.at[mol_index, 'ALL AA'] = all_AA
+                    mol_df.at[mol_index, 'PEPTIDE BONDS'] = peptide_bonds
+                    if NNAAs_info:
+                        for NNAA, ter_or_not, bond_sites in NNAAs_info:
+                            NNAA_df = enlist_NNAA(NNAA, NNAA_df, ter_or_not, bond_sites)
+                pbar.update(len(batch_results))
+    return NNAA_df, mol_df
+def highlight_bonds_with_AA(mol_s):  # with AA colors
+    bond_highlights = defaultdict(lambda: [])
+    for bond in mol_s.GetBonds():
+        atom1_i, atom2_i, prop1, prop2 = get_connected_atoms_and_props(bond, mol_s)
+        if (label_belongs_to_AA(prop1) and prop1 == prop2):  # if the bond is within the same AA
+            bond_highlights[bond.GetIdx()].append(aa2color_dict[prop1[:3]])
+    return bond_highlights
+def relabel_batch(mol_df, NNAA_df):
+    # Initialize a list to collect row data
+    local_mol_data = []
+    for _, row in mol_df.iterrows():
+        mol_index = row['ID']
+        mol = row['MOL']
+        all_AA = row['ALL AA']
+        peptide_bonds = row['PEPTIDE BONDS']
+        try:
+            # Process molecule if not all amino acids are labeled
+            if not all_AA:
+                mol = relabel_NNAA(mol, NNAA_df)
+            # Perform various processing tasks
+            bond_highlights = highlight_bonds_with_AA(mol)
+            peptide_bonded_props, peptide_bonded_atoms = search_terminal_AA(mol)
+            first_atom_index = get_first_atom_index(mol, peptide_bonded_props, peptide_bonded_atoms)
+            aa_list = reorder_AAs(mol, first_atom_index)
+            split_seq = write_seq(aa_list)
+            seq = "".join(split_seq)
+            error = filter_out(split_seq, mol, peptide_bonds)
+            if error:
+                seq = ""
+        except Exception as e:
+            error = str(e)  # Ensure error is a string
+            seq = ""
+        # Collect data in a list of dictionaries
+        local_mol_data.append({'ID': mol_index, 'SEQUENCE': seq, 'ERROR': error, 'BOND HIGHLIGHTS': bond_highlights})
+    return pd.DataFrame(local_mol_data)
+def relabel_batches(mol_df, NNAA_df, batch_size):
+    # Check if NNAA_df is empty
+    if NNAA_df.empty:
+        print("Warning: NNAA_df is empty. No NNAAs to process.")
+    # Ensure NNAA_df has an index
+    if NNAA_df.index.empty:
+        NNAA_df = NNAA_df.reset_index(drop=True)
+    mol_df['BOND HIGHLIGHTS'] = ""
+    mol_df_copy = mol_df[mol_df['MOL'] != ""].copy()
+    indices = list(mol_df_copy.index)
+    def process_batch(batch_indices):
+        batch_df = mol_df_copy.loc[batch_indices]
+        return relabel_batch(batch_df, NNAA_df)
+    with ThreadPoolExecutor() as executor:
+        futures = []
+        for i in range(0, len(indices), batch_size):
+            batch_indices = indices[i:i + batch_size]
+            futures.append(executor.submit(process_batch, batch_indices))
+        local_mol_df = mol_df.copy()
+        for future in tqdm(as_completed(futures), total=len(futures), desc="4/4 Relabelling mols"):
+            mol_dataset_per_batch = future.result()
+            for _, row in mol_dataset_per_batch.iterrows():
+                local_mol_df.loc[local_mol_df['ID'] == row['ID'], ['SEQUENCE', 'ERROR', 'BOND HIGHLIGHTS']] = row[['SEQUENCE', 'ERROR', 'BOND HIGHLIGHTS']].values
+    return local_mol_df
+def output_NNAA(NNAA_df, output_dir):
+    # Drop the 'MOL' column
+    NNAA_df = NNAA_df.drop(columns=['MOL'])
+    NNAA_df['TAUTOMERS'] = None
+    # Add 'COUNT' by 'TAUTOMER HASH' group and deduplicate by 'TAUTOMER HASH'
+    NNAA_df = NNAA_df.groupby('TAUTOMER HASH').agg(
+        ID=('ID', 'first'),
+        SMILES=('SMILES', 'first'),
+        TAUTOMERS=('SMILES', lambda x: ','.join(x.unique())),
+        TERMINAL=('TERMINAL', 'first'),
+        BOND_SITES=('BOND SITES', 'first'),
+    ).reset_index().drop_duplicates(subset='TAUTOMER HASH', keep='first')
+    NNAA_df = NNAA_df.drop(columns=['TAUTOMER HASH'])
+    print(output_dir)
+    NNAA_df.to_csv(os.path.join(output_dir, "raw/ncAAs_raw.txt"), sep='\t', index=False)
+def output_mols(mol_df, output_dir, draw):
+    if draw:
+        drawer = MoleculeDrawer(output_dir)
+        def safe_draw(row):
+            try:
+                drawer.draw_input_mol(row['MOL'], row['ID'], row['SEQUENCE'], row['BOND HIGHLIGHTS'])
+            except Exception as e:
+                return None  # Return None to effectively ignore this row
+        # Apply the safe drawing function to each row
+        mol_df.apply(lambda row: safe_draw(row), axis=1)
+    mol_df.drop(columns=['MOL', 'PEPTIDE BONDS'], inplace=True)
+    # bring 'SEQUENCE' column next to 'ID'
+    cols = ['ID', 'SEQUENCE'] + [col for col in mol_df.columns if col not in ['ID', 'SEQUENCE']]
+    mol_df = mol_df[cols]
+    mol_df.to_csv(os.path.join(output_dir, "raw/sequences_raw.txt"), sep='\t', index=False)
+def get_rdkit_tautomer_hash(smi):
+    mol = Chem.MolFromSmiles(smi)
+    if mol is None:
+        return None
+    layers = RegistrationHash.GetMolLayers(mol)
+    return layers[HashLayer.TAUTOMER_HASH]
+def main():
+    mol_df = load_data(input_file)
+    NNAA_df, mol_df = label_molecules_in_batches(mol_df, batch_size, smi2mol, ignore_cyclic_peptide, min_amino_acids, max_workers)
+    NNAA_df['TAUTOMER HASH'] = NNAA_df['SMILES'].apply(get_rdkit_tautomer_hash)
+    NNAA_df = NNAAs_with_OH_removed(NNAA_df)
+    NNAA_df = add_IDs(NNAA_df)
+    mol_df = relabel_batches(mol_df, NNAA_df, batch_size)
+    output_NNAA(NNAA_df, output_dir)
+    output_mols(mol_df, output_dir, draw)
+if __name__ == '__main__':
+    args = parse_arguments()
+    input_file = args.input_file
+    ignore_cyclic_peptide = not args.process_cyclic
+    min_amino_acids = args.min_amino_acids
+    batch_size = args.batch_size
+    output_dir = args.output_dir
+    max_workers = args.max_workers
+    draw = args.draw
+    os.makedirs(output_dir, exist_ok=True)
+    os.makedirs(os.path.join(output_dir, "raw"), exist_ok=True)
+    main()

src/prepare_GPepT_data.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import random
+import argparse
+import pandas as pd
+import os
+# Set up argument parser
+parser = argparse.ArgumentParser(description="Process sequences from an input file and split them into two output files.")
+parser.add_argument('--output_dir', type=str, default='output/tmp', help="Directory containing the input file")
+args = parser.parse_args()
+# Define input file and output file paths
+input_file = os.path.join(args.output_dir, 'standard/sequences_standardized.txt')
+os.makedirs(os.path.join(args.output_dir, 'for_GPepT'), exist_ok=True)
+output_file_90 = os.path.join(args.output_dir, 'for_GPepT/train90.txt')
+output_file_10 = os.path.join(args.output_dir, 'for_GPepT/val10.txt')
+# Check if the input file exists
+if not os.path.exists(input_file):
+    # No ncAAs?
+    input_file = os.path.join(args.output_dir, 'raw/sequences_raw.txt')
+    if not os.path.exists(input_file):
+        print(f"Error: The input file '{input_file}' does not exist.")
+        exit(1)
+# Read the input file into a pandas DataFrame
+df = pd.read_csv(input_file, sep='\t')
+# Extract sequences and add <endoftext> to each
+sequences = df['SEQUENCE'].apply(lambda x: x + '<|endoftext|>')
+# Shuffle the sequences to randomize the split
+sequences = sequences.sample(frac=1, random_state=42).reset_index(drop=True)
+# Split the sequences into 90% and 10%
+split_index = int(0.9 * len(sequences))
+sequences_90 = sequences[:split_index]
+sequences_10 = sequences[split_index:]
+# Write the sequences to the output files
+sequences_90.to_csv(output_file_90, index=False, header=False)
+sequences_10.to_csv(output_file_10, index=False, header=False)
+print(f"Data has been successfully split into {output_file_90} and {output_file_10}")

src/standardizer.py ADDED Viewed

	@@ -0,0 +1,98 @@

+#!/usr/bin/env python3
+import pandas as pd
+import re
+import argparse
+import os
# Zero-width split point in front of every capital letter. The sequence
# format is assumed to start each monomer ID with a capital letter, which is
# the tokenisation this script has always used.
_TOKEN_BOUNDARY = re.compile(r"(?=[A-Z])")

# Label given to raw ncAAs whose SMILES is absent from the standard dictionary.
_UNKNOWN_ID = "[UNK]"


def parse_arguments(argv=None):
    """Parse the command-line options of the standardizer.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, i.e. normal script usage.

    Returns:
        argparse.Namespace with ``output_dir`` and ``dictionary_file``.
    """
    parser = argparse.ArgumentParser(description="Standardize non-canonical amino acids (ncAAs) and sequences.")
    parser.add_argument("--output_dir", default='output/tmp', help="Directory to save output files.")
    parser.add_argument("--dictionary_file", default='dictionary.txt',
                        help="Tab-separated standard ncAA dictionary with ID and SMILES columns.")
    return parser.parse_args(argv)


def _load_raw_ncaas(path):
    """Return the raw ncAA rows whose ID starts with 'X', or None if there are none.

    A missing file, an empty file, a header-only file and a file without any
    'X' IDs are all treated as "no ncAAs".
    """
    try:
        raw_ncAAs = pd.read_csv(path, sep='\t')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return None
    if raw_ncAAs.empty:
        return None
    # astype(str) makes the mask well-defined for missing or non-string IDs.
    is_ncaa = raw_ncAAs['ID'].astype(str).str.startswith('X')
    # .copy() so the later ID assignment does not write into a view.
    raw_ncAAs = raw_ncAAs[is_ncaa].copy()
    return None if raw_ncAAs.empty else raw_ncAAs


def _relabel_ncaas(raw_ncAAs, standard_ncAAs):
    """Relabel raw ncAA IDs with the ID of the dictionary entry sharing their SMILES.

    Returns:
        (relabeled, id_map, unknown_ids): a copy of *raw_ncAAs* whose ID
        column holds the standard ID or "[UNK]"; the raw-ID -> standard-ID
        mapping of the matched entries; the set of raw IDs without a match.
    """
    # One dict lookup per row instead of scanning the dictionary every time.
    # drop_duplicates keeps the first entry per SMILES, matching the previous
    # "first matching row wins" behaviour.
    known = standard_ncAAs.dropna(subset=['SMILES']).drop_duplicates(subset='SMILES')
    smiles_to_id = dict(zip(known['SMILES'], known['ID']))

    id_map = {}
    unknown_ids = set()
    new_ids = []
    for old_id, smiles in zip(raw_ncAAs['ID'], raw_ncAAs['SMILES']):
        if smiles in smiles_to_id:
            id_map[old_id] = smiles_to_id[smiles]
            new_ids.append(smiles_to_id[smiles])
        else:
            unknown_ids.add(old_id)
            new_ids.append(_UNKNOWN_ID)

    relabeled = raw_ncAAs.copy()
    relabeled['ID'] = new_ids
    return relabeled, id_map, unknown_ids


def _relabel_sequence(sequence, token_map):
    """Replace raw ncAA IDs in *sequence*; return '' if any ncAA is unknown."""
    # Split the sequence in front of capital letters, which separates each ID
    tokens = _TOKEN_BOUNDARY.split(sequence)
    relabeled_tokens = [token_map.get(token, token) for token in tokens]
    # If '[UNK]' is in the relabeled tokens, return an empty string
    if _UNKNOWN_ID in relabeled_tokens:
        return ''
    return ''.join(relabeled_tokens)


def main(argv=None):
    """Map raw ncAA IDs onto the standard dictionary and rewrite the sequences.

    Args:
        argv: Optional list of command-line arguments (default: sys.argv[1:]).

    Inputs, relative to ``--output_dir``: ``raw/ncAAs_raw.txt`` and
    ``raw/sequences_raw.txt``, plus the ``--dictionary_file``.
    Outputs: ``nc_raw2standard.txt``, ``standard/nc_standardized.txt`` and
    ``standard/sequences_standardized.txt``.

    When there are no raw ncAAs, "No ncAAs found." is printed and nothing is
    written. Any other failure is reported on stderr with its real cause;
    the function still returns normally, as it always did.
    """
    import sys  # local import: only needed for error reporting

    args = parse_arguments(argv)
    output_dir = args.output_dir
    # Ensure output directory exists (creates output_dir itself as a parent)
    os.makedirs(os.path.join(output_dir, 'standard'), exist_ok=True)

    # Paths for input files
    standard_ncAAs_file = args.dictionary_file
    raw_ncAAs_file = os.path.join(output_dir, 'raw/ncAAs_raw.txt')
    sequence_file = os.path.join(output_dir, 'raw/sequences_raw.txt')
    # Paths for output files
    id_mapping_output = os.path.join(output_dir, 'nc_raw2standard.txt')
    relabeled_ncAAs_output = os.path.join(output_dir, 'standard/nc_standardized.txt')
    relabeled_sequence_output = os.path.join(output_dir, 'standard/sequences_standardized.txt')

    try:
        raw_ncAAs = _load_raw_ncaas(raw_ncAAs_file)
        if raw_ncAAs is None:
            # Expected situation, detected explicitly rather than through
            # whatever exception pandas happened to raise on empty input.
            print("No ncAAs found.")
            return

        standard_ncAAs = pd.read_csv(standard_ncAAs_file, sep='\t')
        raw_ncAAs, id_map, unknown_ids = _relabel_ncaas(raw_ncAAs, standard_ncAAs)

        # Save the ID mapping (matched entries only) and the relabeled ncAAs
        id_map_df = pd.DataFrame(list(id_map.items()), columns=['raw_ID', 'standard_ID'])
        id_map_df.to_csv(id_mapping_output, sep='\t', index=False)
        raw_ncAAs.to_csv(relabeled_ncAAs_output, sep='\t', index=False)

        # Load the sequence file. Only empty fields are treated as missing:
        # pandas' default NA markers such as "NA" are valid peptide strings.
        sequence_df = pd.read_csv(
            sequence_file,
            sep='\t',
            dtype={'SEQUENCE': str},
            keep_default_na=False,
            na_values=[''],
        )
        # Drop rows whose 'SEQUENCE' is NaN
        sequence_df = sequence_df.dropna(subset=['SEQUENCE'])

        # Unknown raw IDs must map to "[UNK]" as well; otherwise they would
        # stay in the sequence and be read as unrelated standard IDs.
        token_map = dict(id_map)
        token_map.update((raw_id, _UNKNOWN_ID) for raw_id in unknown_ids)
        sequence_df['SEQUENCE'] = sequence_df['SEQUENCE'].apply(
            lambda sequence: _relabel_sequence(sequence, token_map)
        )

        # Save the relabeled sequences
        sequence_df.to_csv(relabeled_sequence_output, sep='\t', index=False)
        print("Relabeling complete.")
        print(f"ID mapping saved to: {id_mapping_output}")
        print(f"Relabeled ncAAs saved to: {relabeled_ncAAs_output}")
        print(f"Relabeled sequences saved to: {relabeled_sequence_output}")
    except Exception as e:
        # Top-level boundary: stay best-effort (no crash), but report the
        # actual failure instead of the misleading "No ncAAs found.".
        print(f"Error: standardization failed: {type(e).__name__}: {e}", file=sys.stderr)


if __name__ == "__main__":
    main()