yzhang@u.duke.nus.edu committed on
Commit
4868d91
·
1 Parent(s): 432a60b

add sequence 2 smiles feature

Browse files
Files changed (2) hide show
  1. aminoacid_selective.py +557 -0
  2. app.py +412 -36
aminoacid_selective.py ADDED
@@ -0,0 +1,557 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """Definitions and properties of amino-acids for p2smi"""
3
+
4
+ # Fmoc building blocks / non-natural residues first, then natural L- and D-amino acids:
5
+
6
+ specific_aminos = {
7
+ "Fmoc-Aib-OH": {
8
+ "Code": "Aib",
9
+ "Formula": "C28H29NO5",
10
+ "Letter": "Ŷ",
11
+ "MolWeight": 221.141578848,
12
+ "SMILES": "CC(C)(N)C(=O)O",
13
+ "cterm": "NC(C)(C)C(=O)[*:2]",
14
+ "disulphide": False,
15
+ "ester": False,
16
+ "nterm": "[*:1]NC(C)(C)C(=O)O",
17
+ },
18
+ "Fmoc-Asp(OtBu)-(Dmb)Gly-OH": {
19
+ "Code": "Dtg",
20
+ "Formula": "C28H29NO5",
21
+ "Letter": "Ĝ",
22
+ "MolWeight": 221.141578848,
23
+ "SMILES": "N[C@@H](CC(=O)OC(C)(C)C)C(=O)N(CC1=C(C=C(C=C1)OC)OC)CC(=O)O",
24
+ "nterm": "[*:1]N[C@@H](CC(=O)OC(C)(C)C)C(=O)N(CC1=C(C=C(C=C1)OC)OC)CC(=O)O",
25
+ "cterm": "N[C@@H](CC(=O)OC(C)(C)C)C(=O)N(CC1=C(C=C(C=C1)OC)OC)CC(=O)[*:2]",
26
+ "disulphide": False,
27
+ "ester": False,
28
+ },
29
+ "Fmoc-Cys(Mmt)-OH": {
30
+ "Code": "Cmt",
31
+ "Formula": "C28H29NO5",
32
+ "Letter": "Ĉ",
33
+ "MolWeight": 221.141578848,
34
+ "SMILES": "COC1=CC=C(C=C1)C(C2=CC=CC=C2)(C3=CC=CC=C3)SC[C@@H](N)C(=O)O",
35
+ "nterm": "N([*:1])[C@@H](CSC(C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=C(OC)C=C3)C(=O)O",
36
+ "cterm": "N[C@@H](CSC(C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=C(OC)C=C3)C(=O)[*:2]",
37
+ "disulphide": False,
38
+ "ester": False,
39
+ },
40
+ "Fmoc-Glu(OAll)-OH": {
41
+ "Code": "Eal",
42
+ "Formula": "C28H29NO5",
43
+ "Letter": "Ė",
44
+ "MolWeight": 221.141578848,
45
+ "SMILES": "C=CCOC(=O)CC[C@@H](N)C(=O)O",
46
+ "nterm": "[*:1]N[C@@H](CCC(=O)OCC=C)C(=O)O",
47
+ "cterm": "N[C@@H](CCC(=O)OCC=C)C(=O)[*:2]",
48
+ "disulphide": False,
49
+ "ester": False,
50
+ },
51
+ "Fmoc-Lys(palmitoyl-Glu-OtBu)-OH": {
52
+ "Code": "Kpg",
53
+ "Formula": "C28H29NO5",
54
+ "Letter": "Ƙ",
55
+ "MolWeight": 221.141578848,
56
+ "SMILES": "N[C@@H](CCCNC(=O)CCC[C@@H](NC(=O)CCCCCCCCCCCCCCCC)C(=O)OC(C)(C)C)C(=O)O",
57
+ "nterm": "[*:1]N[C@@H](CCCN(C(=O)CCC[C@@H](NC(=O)CCCCCCCCCCCCCCCC)C(=O)OC(C)(C)C))C(=O)O",
58
+ "cterm": "N[C@@H](CCCN(C(=O)CCC[C@@H](NC(=O)CCCCCCCCCCCCCCCC)C(=O)OC(C)(C)C))C(=O)[*:2]",
59
+ "disulphide": False,
60
+ "ester": False,
61
+ },
62
+ "Fmoc-Thr(PO(OBzl)OH)-OH": {
63
+ "Code": "Tpb",
64
+ "Formula": "C28H29NO5",
65
+ "Letter": "Ṯ",
66
+ "MolWeight": 221.141578848,
67
+ "SMILES": "N[C@@H]([C@H](C)OP(=O)(O)OCc1ccccc1)C(=O)O",
68
+ "disulphide": False,
69
+ "ester": False,
70
+ "nterm": "[*:1]N[C@@H]([C@H](C)OP(=O)(O)OCC1=CC=CC=C1)C(=O)O",
71
+ "cterm": "N[C@@H]([C@H](C)OP(=O)(O)OCC1=CC=CC=C1)C(=O)[*:2]"
72
+ },
73
+ "Fmoc-Cycloleucine": {
74
+ "Code": "Cyl",
75
+ "Formula": "C28H29NO5",
76
+ "Letter": "Ċ",
77
+ "MolWeight": 221.141578848,
78
+ "SMILES": "NC1(CCCC1)C(=O)O",
79
+ "nterm": "[*:1]NC1(CCCC1)C(=O)O",
80
+ "cterm": "NC1(CCCC1)C(=O)[*:2]",
81
+ "disulphide": False,
82
+ "ester": False,
83
+ },
84
+ "Fmoc-N-Me-Ala-OH": {
85
+ "Code": "Nma",
86
+ "Formula": "C28H29NO5",
87
+ "Letter": "Ṃ",
88
+ "MolWeight": 221.141578848,
89
+ "SMILES": "CN([C@@H](C)C(=O)O)",
90
+ "cterm": "N(C)[C@@H](C)C(=O)[*:2]",
91
+ "disulphide": False,
92
+ "ester": False,
93
+ "nterm": "[*:1]N(C)[C@@H](C)C(=O)O",
94
+ },
95
+ "Fmoc-N-Me-Leu-OH": {
96
+ "Code": "Nml",
97
+ "Formula": "C28H29NO5",
98
+ "Letter": "Ŀ",
99
+ "MolWeight": 221.141578848,
100
+ "SMILES": "CN([C@@H](CC(C)C)C(=O)O)",
101
+ "cterm": "N(C)[C@@H](CC(C)C)C(=O)[*:2]",
102
+ "disulphide": False,
103
+ "ester": False,
104
+ "nterm": "[*:1]N(C)[C@@H](CC(C)C)C(=O)O",
105
+ },
106
+ "Fmoc-Nle-OH": {
107
+ "Code": "Nle",
108
+ "Formula": "C28H29NO5",
109
+ "Letter": "Ł",
110
+ "MolWeight": 221.141578848,
111
+ "SMILES": "N[C@@H](CCCC)C(=O)O",
112
+ "nterm": "[*:1]N[C@@H](CCCC)C(=O)O",
113
+ "cterm": "N[C@@H](CCCC)C(=O)[*:2]",
114
+ "disulphide": False,
115
+ "ester": False,
116
+ },
117
+ "N-Fmoc-L-homophenylalanine": {
118
+ "Code": "Hph",
119
+ "Formula": "C28H29NO5",
120
+ "Letter": "Ĥ",
121
+ "MolWeight": 221.141578848,
122
+ "SMILES": "N[C@@H](CCC1=CC=CC=C1)C(=O)O",
123
+ "nterm": "[*:1]N[C@@H](CCC1=CC=CC=C1)C(=O)O",
124
+ "cterm": "N[C@@H](CCC1=CC=CC=C1)C(=O)[*:2]",
125
+ "disulphide": False,
126
+ "ester": False,
127
+ },
128
+ "Glycine": {
129
+ "Code": "Gly",
130
+ "Formula": "C2H5NO2",
131
+ "Letter": "G",
132
+ "MolWeight": "75.07",
133
+ "SMILES": "NCC(=O)O",
134
+ "cterm": False,
135
+ "disulphide": False,
136
+ "ester": False,
137
+ "nterm": False,
138
+ },
139
+ "L-Alanine": {
140
+ "Code": "Ala",
141
+ "Formula": "C3H7NO2",
142
+ "Letter": "A",
143
+ "MolWeight": "89.09",
144
+ "SMILES": "N[C@@H](C)C(=O)O",
145
+ "cterm": False,
146
+ "disulphide": False,
147
+ "ester": False,
148
+ "nterm": False,
149
+ },
150
+ "L-Arginine": {
151
+ "Code": "Arg",
152
+ "Formula": "C6H14N4O2",
153
+ "Letter": "R",
154
+ "MolWeight": "174.20",
155
+ "SMILES": "N[C@@H](CCCNC(=N)N)C(=O)O",
156
+ "cterm": False,
157
+ "disulphide": False,
158
+ "ester": False,
159
+ "nterm": "N[C@@H](CCCNC(=N*)N)C(=O)O",
160
+ },
161
+ "L-Asparagine": {
162
+ "Code": "Asn",
163
+ "Formula": "C4H8N2O3",
164
+ "Letter": "N",
165
+ "MolWeight": "132.12",
166
+ "SMILES": "N[C@@H](CC(=O)N)C(=O)O",
167
+ "cterm": False,
168
+ "disulphide": False,
169
+ "ester": False,
170
+ "nterm": "N[C@@H](CC(=O)N*)C(=O)O",
171
+ },
172
+ "L-Aspartic_Acid": {
173
+ "Code": "Asp",
174
+ "Formula": "C4H7NO4",
175
+ "Letter": "D",
176
+ "MolWeight": "133.10",
177
+ "SMILES": "N[C@@H](CC(=O)O)C(=O)O",
178
+ "cterm": "N[C@@H](CC*(=O))C(=O)O",
179
+ "disulphide": False,
180
+ "ester": False,
181
+ "nterm": False,
182
+ },
183
+ "L-Cysteine": {
184
+ "Code": "Cys",
185
+ "Formula": "C3H7NO2S",
186
+ "Letter": "C",
187
+ "MolWeight": "121.16",
188
+ "SMILES": "N[C@@H](CS)C(=O)O",
189
+ "cterm": False,
190
+ "disulphide": "N[C@@H](CS*)C(=O)O",
191
+ "ester": False,
192
+ "nterm": False,
193
+ },
194
+ "L-Glutamic_Acid": {
195
+ "Code": "Glu",
196
+ "Formula": "C5H9NO4",
197
+ "Letter": "E",
198
+ "MolWeight": "147.13",
199
+ "SMILES": "N[C@@H](CCC(=O)O)C(=O)O",
200
+ "cterm": "N[C@@H](CCC*(=O))C(=O)O",
201
+ "disulphide": False,
202
+ "ester": False,
203
+ "nterm": False,
204
+ },
205
+ "L-Glutamine": {
206
+ "Code": "Gln",
207
+ "Formula": "C5H10N2O3",
208
+ "Letter": "Q",
209
+ "MolWeight": "146.15",
210
+ "SMILES": "N[C@@H](CCC(=O)N)C(=O)O",
211
+ "cterm": False,
212
+ "disulphide": False,
213
+ "ester": False,
214
+ "nterm": "N[C@@H](CCC(=O)N*)C(=O)O",
215
+ },
216
+ "L-Histidine": {
217
+ "Code": "His",
218
+ "Formula": "C6H9N3O2",
219
+ "Letter": "H",
220
+ "MolWeight": "155.16",
221
+ "SMILES": "N[C@@H](CC1=CNC=N1)C(=O)O",
222
+ "cterm": False,
223
+ "disulphide": False,
224
+ "ester": False,
225
+ "nterm": False,
226
+ },
227
+ "L-Isoleucine": {
228
+ "Code": "Ile",
229
+ "Formula": "C6H13NO2",
230
+ "Letter": "I",
231
+ "MolWeight": "131.18",
232
+ "SMILES": "N[C@@H]([C@H](CC)C)C(=O)O",
233
+ "cterm": False,
234
+ "disulphide": False,
235
+ "ester": False,
236
+ "nterm": False,
237
+ },
238
+ "L-Leucine": {
239
+ "Code": "Leu",
240
+ "Formula": "C6H13NO2",
241
+ "Letter": "L",
242
+ "MolWeight": "131.18",
243
+ "SMILES": "N[C@@H](CC(C)C)C(=O)O",
244
+ "cterm": False,
245
+ "disulphide": False,
246
+ "ester": False,
247
+ "nterm": False,
248
+ },
249
+ "L-Lysine": {
250
+ "Code": "Lys",
251
+ "Formula": "C6H14N2O2",
252
+ "Letter": "K",
253
+ "MolWeight": "146.19",
254
+ "SMILES": "N[C@@H](CCCCN)C(=O)O",
255
+ "cterm": False,
256
+ "disulphide": False,
257
+ "ester": False,
258
+ "nterm": "N[C@@H](CCCCN*)C(=O)O",
259
+ },
260
+ "L-Methionine": {
261
+ "Code": "Met",
262
+ "Formula": "C5H11NO2S",
263
+ "Letter": "M",
264
+ "MolWeight": "149.21",
265
+ "SMILES": "N[C@@H](CCSC)C(=O)O",
266
+ "cterm": False,
267
+ "disulphide": False,
268
+ "ester": False,
269
+ "nterm": False,
270
+ },
271
+ "L-Phenylalanine": {
272
+ "Code": "Phe",
273
+ "Formula": "C9H11NO2",
274
+ "Letter": "F",
275
+ "MolWeight": "165.19",
276
+ "SMILES": "N[C@@H](Cc1ccccc1)C(=O)O",
277
+ "cterm": False,
278
+ "disulphide": False,
279
+ "ester": False,
280
+ "nterm": False,
281
+ },
282
+ "L-Proline": {
283
+ "Code": "Pro",
284
+ "Formula": "C5H9NO2",
285
+ "Letter": "P",
286
+ "MolWeight": "115.13",
287
+ "SMILES": "N1[C@@H](CCC1)C(=O)O",
288
+ "cterm": False,
289
+ "disulphide": False,
290
+ "ester": False,
291
+ "nterm": False,
292
+ },
293
+ "L-Serine": {
294
+ "Code": "Ser",
295
+ "Formula": "C3H7NO3",
296
+ "Letter": "S",
297
+ "MolWeight": "105.09",
298
+ "SMILES": "N[C@@H](CO)C(=O)O",
299
+ "cterm": False,
300
+ "disulphide": False,
301
+ "ester": "N[C@@H](CO*)C(=O)O",
302
+ "nterm": False,
303
+ },
304
+ "L-Threonine": {
305
+ "Code": "Thr",
306
+ "Formula": "C4H9NO3",
307
+ "Letter": "T",
308
+ "MolWeight": "119.12",
309
+ "SMILES": "N[C@@H]([C@H](O)C)C(=O)O",
310
+ "cterm": False,
311
+ "disulphide": False,
312
+ "ester": "N[C@@H]([C@H](O*)C)C(=O)O",
313
+ "nterm": False,
314
+ },
315
+ "L-Tryptophan": {
316
+ "Code": "Trp",
317
+ "Formula": "C11H12N2O2",
318
+ "Letter": "W",
319
+ "MolWeight": "204.23",
320
+ "SMILES": "N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)O",
321
+ "cterm": False,
322
+ "disulphide": False,
323
+ "ester": False,
324
+ "nterm": False,
325
+ },
326
+ "L-Tyrosine": {
327
+ "Code": "Tyr",
328
+ "Formula": "C9H11NO3",
329
+ "Letter": "Y",
330
+ "MolWeight": "181.19",
331
+ "SMILES": "N[C@@H](Cc1ccc(O)cc1)C(=O)O",
332
+ "cterm": False,
333
+ "disulphide": False,
334
+ "ester": "N[C@@H](Cc1ccc(O*)cc1)C(=O)O",
335
+ "nterm": False,
336
+ },
337
+ "L-Valine": {
338
+ "Code": "Val",
339
+ "Formula": "C5H11NO2",
340
+ "Letter": "V",
341
+ "MolWeight": "117.15",
342
+ "SMILES": "N[C@@H](C(C)C)C(=O)O",
343
+ "cterm": False,
344
+ "disulphide": False,
345
+ "ester": False,
346
+ "nterm": False,
347
+ },
348
+ "D-Alanine": {
349
+ "Code": "ala",
350
+ "Formula": "C3H7NO2",
351
+ "Letter": "a",
352
+ "MolWeight": "89.09",
353
+ "SMILES": "N[C@H](C)C(=O)O",
354
+ "cterm": False,
355
+ "disulphide": False,
356
+ "ester": False,
357
+ "nterm": False,
358
+ },
359
+ "D-Arginine": {
360
+ "Code": "arg",
361
+ "Formula": "C6H14N4O2",
362
+ "Letter": "r",
363
+ "MolWeight": "174.20",
364
+ "SMILES": "N[C@H](CCCNC(=N)N)C(=O)O",
365
+ "cterm": False,
366
+ "disulphide": False,
367
+ "ester": False,
368
+ "nterm": "N[C@H](CCCNC(=N*)N)C(=O)O",
369
+ },
370
+ "D-Asparagine": {
371
+ "Code": "asn",
372
+ "Formula": "C4H8N2O3",
373
+ "Letter": "n",
374
+ "MolWeight": "132.12",
375
+ "SMILES": "N[C@H](CC(=O)N)C(=O)O",
376
+ "cterm": False,
377
+ "disulphide": False,
378
+ "ester": False,
379
+ "nterm": "N[C@H](CC(=O)N*)C(=O)O",
380
+ },
381
+ "D-Aspartic_Acid": {
382
+ "Code": "asp",
383
+ "Formula": "C4H7NO4",
384
+ "Letter": "d",
385
+ "MolWeight": "133.10",
386
+ "SMILES": "N[C@H](CC(=O)O)C(=O)O",
387
+ "cterm": "N[C@H](CC*(=O))C(=O)O",
388
+ "disulphide": False,
389
+ "ester": False,
390
+ "nterm": False,
391
+ },
392
+ "D-Cysteine": {
393
+ "Code": "cys",
394
+ "Formula": "C3H7NO2S",
395
+ "Letter": "c",
396
+ "MolWeight": "121.16",
397
+ "SMILES": "N[C@H](CS)C(=O)O",
398
+ "cterm": False,
399
+ "disulphide": "N[C@H](CS*)C(=O)O",
400
+ "ester": False,
401
+ "nterm": False,
402
+ },
403
+ "D-Glutamic_Acid": {
404
+ "Code": "glu",
405
+ "Formula": "C5H9NO4",
406
+ "Letter": "e",
407
+ "MolWeight": "147.13",
408
+ "SMILES": "N[C@H](CCC(=O)O)C(=O)O",
409
+ "cterm": "N[C@H](CCC*(=O))C(=O)O",
410
+ "disulphide": False,
411
+ "ester": False,
412
+ "nterm": False,
413
+ },
414
+ "D-Glutamine": {
415
+ "Code": "gln",
416
+ "Formula": "C5H10N2O3",
417
+ "Letter": "q",
418
+ "MolWeight": "146.15",
419
+ "SMILES": "N[C@H](CCC(=O)N)C(=O)O",
420
+ "cterm": False,
421
+ "disulphide": False,
422
+ "ester": False,
423
+ "nterm": "N[C@H](CCC(=O)N*)C(=O)O",
424
+ },
425
+ "D-Histidine": {
426
+ "Code": "his",
427
+ "Formula": "C6H9N3O2",
428
+ "Letter": "h",
429
+ "MolWeight": "155.16",
430
+ "SMILES": "N[C@H](CC1=CNC=N1)C(=O)O",
431
+ "cterm": False,
432
+ "disulphide": False,
433
+ "ester": False,
434
+ "nterm": False,
435
+ },
436
+ "D-Isoleucine": {
437
+ "Code": "ile",
438
+ "Formula": "C6H13NO2",
439
+ "Letter": "i",
440
+ "MolWeight": "131.18",
441
+ "SMILES": "N[C@H]([C@@H](CC)C)C(=O)O",
442
+ "cterm": False,
443
+ "disulphide": False,
444
+ "ester": False,
445
+ "nterm": False,
446
+ },
447
+ "D-Leucine": {
448
+ "Code": "leu",
449
+ "Formula": "C6H13NO2",
450
+ "Letter": "l",
451
+ "MolWeight": "131.18",
452
+ "SMILES": "N[C@H](CC(C)C)C(=O)O",
453
+ "cterm": False,
454
+ "disulphide": False,
455
+ "ester": False,
456
+ "nterm": False,
457
+ },
458
+ "D-Lysine": {
459
+ "Code": "lys",
460
+ "Formula": "C6H14N2O2",
461
+ "Letter": "k",
462
+ "MolWeight": "146.19",
463
+ "SMILES": "N[C@H](CCCCN)C(=O)O",
464
+ "cterm": False,
465
+ "disulphide": False,
466
+ "ester": False,
467
+ "nterm": "N[C@H](CCCCN*)C(=O)O",
468
+ },
469
+ "D-Methionine": {
470
+ "Code": "met",
471
+ "Formula": "C5H11NO2S",
472
+ "Letter": "m",
473
+ "MolWeight": "149.21",
474
+ "SMILES": "N[C@H](CCSC)C(=O)O",
475
+ "cterm": False,
476
+ "disulphide": False,
477
+ "ester": False,
478
+ "nterm": False,
479
+ },
480
+ "D-Phenylalanine": {
481
+ "Code": "phe",
482
+ "Formula": "C9H11NO2",
483
+ "Letter": "f",
484
+ "MolWeight": "165.19",
485
+ "SMILES": "N[C@H](Cc1ccccc1)C(=O)O",
486
+ "cterm": False,
487
+ "disulphide": False,
488
+ "ester": False,
489
+ "nterm": False,
490
+ },
491
+ "D-Proline": {
492
+ "Code": "pro",
493
+ "Formula": "C5H9NO2",
494
+ "Letter": "p",
495
+ "MolWeight": "115.13",
496
+ "SMILES": "N1[C@H](CCC1)C(=O)O",
497
+ "cterm": False,
498
+ "disulphide": False,
499
+ "ester": False,
500
+ "nterm": False,
501
+ },
502
+ "D-Serine": {
503
+ "Code": "ser",
504
+ "Formula": "C3H7NO3",
505
+ "Letter": "s",
506
+ "MolWeight": "105.09",
507
+ "SMILES": "N[C@H](CO)C(=O)O",
508
+ "cterm": False,
509
+ "disulphide": False,
510
+ "ester": "N[C@H](CO*)C(=O)O",
511
+ "nterm": False,
512
+ },
513
+ "D-Tryptophan": {
514
+ "Code": "trp",
515
+ "Formula": "C11H12N2O2",
516
+ "Letter": "w",
517
+ "MolWeight": "204.23",
518
+ "SMILES": "N[C@H](CC(=CN2)C1=C2C=CC=C1)C(=O)O",
519
+ "cterm": False,
520
+ "disulphide": False,
521
+ "ester": False,
522
+ "nterm": False,
523
+ },
524
+ "D-Tyrosine": {
525
+ "Code": "tyr",
526
+ "Formula": "C9H11NO3",
527
+ "Letter": "y",
528
+ "MolWeight": "181.19",
529
+ "SMILES": "N[C@H](Cc1ccc(O)cc1)C(=O)O",
530
+ "cterm": False,
531
+ "disulphide": False,
532
+ "ester": "N[C@H](Cc1ccc(O*)cc1)C(=O)O",
533
+ "nterm": False,
534
+ },
535
+ "D-Valine": {
536
+ "Code": "val",
537
+ "Formula": "C5H11NO2",
538
+ "Letter": "v",
539
+ "MolWeight": "117.15",
540
+ "SMILES": "N[C@H](C(C)C)C(=O)O",
541
+ "cterm": False,
542
+ "disulphide": False,
543
+ "ester": False,
544
+ "nterm": False,
545
+ },
546
+ "D-Threonine": {
547
+ "Code": "thr",
548
+ "Formula": "C4H9NO3",
549
+ "Letter": "t",
550
+ "MolWeight": "119.12",
551
+ "SMILES": "N[C@H]([C@@H](O)C)C(=O)O",
552
+ "cterm": False,
553
+ "disulphide": False,
554
+ "ester": "N[C@H]([C@@H](O*)C)C(=O)O",
555
+ "nterm": False,
556
+ },
557
+ }
app.py CHANGED
@@ -24,6 +24,39 @@ from io import BytesIO
24
  import tempfile
25
  from rdkit import Chem
26
  from swisssidechain import all_aminos
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  class PeptideAnalyzer:
29
  def __init__(self):
@@ -71,27 +104,40 @@ class PeptideAnalyzer:
71
  self._build_swisssidechain_lookups()
72
 
73
  def _build_swisssidechain_lookups(self):
74
- """Side chain lookups for SwissSidechain UAAs"""
75
- # Exact SMILES match
76
  self.exact_smiles_lookup = {}
77
-
78
- # Clean SMILES lookup (without stereochemistry)
79
  self.clean_smiles_lookup = {}
80
-
 
 
81
  for uaa_name, uaa_data in all_aminos.items():
82
- code = uaa_data["Code"]
83
- letter = uaa_data["Letter"]
84
- smiles = uaa_data["SMILES"]
85
-
86
- self.three_to_one[code] = letter
87
-
88
- self.exact_smiles_lookup[smiles] = code
89
-
90
- # Clean SMILES (no stereochemistry)
91
- clean_smiles = self._remove_stereochemistry(smiles)
92
- if clean_smiles not in self.clean_smiles_lookup:
93
- self.clean_smiles_lookup[clean_smiles] = []
94
- self.clean_smiles_lookup[clean_smiles].append(code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  def _remove_stereochemistry(self, smiles):
97
  """Remove stereochemistry from SMILES"""
@@ -106,52 +152,61 @@ class PeptideAnalyzer:
106
  return cleaned
107
 
108
  def preprocess_complex_residues(self, smiles):
109
- """Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
110
  complex_positions = []
111
-
112
  for pattern, residue_type in self.complex_residue_patterns:
113
  for match in re.finditer(pattern, smiles):
114
  if not any(pos['start'] <= match.start() < pos['end'] or
115
- pos['start'] < match.end() <= pos['end'] for pos in complex_positions):
116
  complex_positions.append({
117
  'start': match.start(),
118
  'end': match.end(),
119
  'type': residue_type,
120
  'pattern': match.group()
121
  })
122
-
 
 
 
 
 
 
 
 
 
 
 
123
  complex_positions.sort(key=lambda x: x['start'])
124
-
125
  if not complex_positions:
126
  return smiles, []
127
-
128
  preprocessed_smiles = smiles
129
  offset = 0
130
-
131
  protected_residues = []
132
-
133
  for pos in complex_positions:
134
  start = pos['start'] + offset
135
- end = pos['end'] + offset
136
-
137
  complex_part = preprocessed_smiles[start:end]
138
-
 
139
  if not ('[C@H]' in complex_part or '[C@@H]' in complex_part):
140
- continue
141
-
 
 
142
  placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
143
-
144
  preprocessed_smiles = preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
145
-
146
  offset += len(placeholder) - (end - start)
147
-
148
  protected_residues.append({
149
  'placeholder': placeholder,
150
  'type': pos['type'],
151
  'content': complex_part
152
  })
153
-
154
  return preprocessed_smiles, protected_residues
 
155
  def split_on_bonds(self, smiles, protected_residues=None):
156
  """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
157
  positions = []
@@ -310,7 +365,11 @@ class PeptideAnalyzer:
310
  def identify_residue(self, segment):
311
  if 'complex_type' in segment:
312
  return segment['complex_type'], []
313
-
 
 
 
 
314
  content = self.clean_terminal_carboxyl(segment)
315
  mods = self.get_modifications(segment)
316
 
@@ -901,6 +960,175 @@ class PeptideStructureGenerator:
901
 
902
  return sio.getvalue().encode('utf-8')
903
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
904
  def process_input(
905
  smiles_input=None,
906
  file_obj=None,
@@ -1045,6 +1273,153 @@ def process_input(
1045
  #structure_files if structure_files else []
1046
  )
1047
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1048
  iface = gr.Interface(
1049
  fn=process_input,
1050
  inputs=[
@@ -1105,6 +1480,7 @@ iface = gr.Interface(
1105
  if __name__ == "__main__":
1106
  iface.launch(share=True)
1107
 
 
1108
  """
1109
  5. Optional linear representation
1110
  6. Optional 3D structure generation (ETKDG and UFF methods)
 
24
  import tempfile
25
  from rdkit import Chem
26
  from swisssidechain import all_aminos
27
+ from aminoacid_selective import specific_aminos
28
+
29
+ def _internal_from_cterm(cterm: str) -> str:
30
+ s = cterm.strip()
31
+ s = re.sub(r'C\(=O\)\[\*:\s*2\]\s*$', '', s) # drop trailing carbonyl anchor
32
+ s = re.sub(r'^\[\*:\s*1\]', '', s) # drop leading anchor
33
+ s = re.sub(r'^\(?N\)?', '', s) # drop leading N
34
+ return s
35
+
36
+ def _internal_from_nterm(nterm: str) -> str:
37
+ s = nterm.strip()
38
+ s = re.sub(r'^\[\*:\s*1\]', '', s) # drop leading anchor
39
+ s = re.sub(r'^\(?N\)?', '', s) # drop leading N
40
+ s = re.sub(r'C\(=O\)O\s*$', '', s) # drop trailing COOH
41
+ return s
42
+
43
+ def _chirality_agnostic_regex(literal_smiles: str) -> re.Pattern:
44
+ """
45
+ Make a regex that matches the literal SMILES but ignores stereo/ring digit specifics.
46
+ - Escapes all chars
47
+ - Makes '@' optional (so [C@@H] / [C@H] / [CH] all match)
48
+ - Allows any ring digit where a digit appears
49
+ """
50
+ esc = re.escape(literal_smiles)
51
+
52
+ # make any '@' optional (two steps to handle @@)
53
+ esc = esc.replace(r'\@\@', r'\@?\@?')
54
+ esc = esc.replace(r'\@', r'\@?')
55
+
56
+ # allow any ring digit(s) where digits appear
57
+ esc = re.sub(r'\\\d+', r'\\d+', esc)
58
+
59
+ return re.compile(esc)
60
 
61
  class PeptideAnalyzer:
62
  def __init__(self):
 
104
  self._build_swisssidechain_lookups()
105
 
106
  def _build_swisssidechain_lookups(self):
 
 
107
  self.exact_smiles_lookup = {}
 
 
108
  self.clean_smiles_lookup = {}
109
+ self.uaa_internal_exact = {}
110
+ self.uaa_internal_patterns = []
111
+
112
  for uaa_name, uaa_data in all_aminos.items():
113
+ code = uaa_data["Code"]
114
+ smiles = uaa_data.get("SMILES", "")
115
+ nterm = uaa_data.get("nterm", "")
116
+ cterm = uaa_data.get("cterm", "")
117
+ letter = uaa_data.get("Letter")
118
+
119
+ # keep existing full-aa lookups
120
+ if smiles:
121
+ self.exact_smiles_lookup[smiles] = code
122
+ clean = self._remove_stereochemistry(smiles)
123
+ self.clean_smiles_lookup.setdefault(clean, []).append(code)
124
+
125
+ internal = ""
126
+ if cterm:
127
+ internal = _internal_from_cterm(cterm)
128
+ elif nterm:
129
+ internal = _internal_from_nterm(nterm)
130
+
131
+ if internal:
132
+ self.exact_smiles_lookup[internal] = code
133
+ clean_int = self._remove_stereochemistry(internal)
134
+ self.clean_smiles_lookup.setdefault(clean_int, []).append(code)
135
+
136
+ self.uaa_internal_exact[code] = internal
137
+ self.uaa_internal_patterns.append((_chirality_agnostic_regex(internal), code))
138
+
139
+ if letter:
140
+ self.three_to_one[code] = letter
141
 
142
  def _remove_stereochemistry(self, smiles):
143
  """Remove stereochemistry from SMILES"""
 
152
  return cleaned
153
 
154
  def preprocess_complex_residues(self, smiles):
 
155
  complex_positions = []
156
+
157
  for pattern, residue_type in self.complex_residue_patterns:
158
  for match in re.finditer(pattern, smiles):
159
  if not any(pos['start'] <= match.start() < pos['end'] or
160
+ pos['start'] < match.end() <= pos['end'] for pos in complex_positions):
161
  complex_positions.append({
162
  'start': match.start(),
163
  'end': match.end(),
164
  'type': residue_type,
165
  'pattern': match.group()
166
  })
167
+
168
+ for rgx, code in getattr(self, 'uaa_internal_patterns', []):
169
+ for match in rgx.finditer(smiles):
170
+ if not any(pos['start'] <= match.start() < pos['end'] or
171
+ pos['start'] < match.end() <= pos['end'] for pos in complex_positions):
172
+ complex_positions.append({
173
+ 'start': match.start(),
174
+ 'end': match.end(),
175
+ 'type': code, # e.g., 'Dtg'
176
+ 'pattern': match.group()
177
+ })
178
+
179
  complex_positions.sort(key=lambda x: x['start'])
 
180
  if not complex_positions:
181
  return smiles, []
182
+
183
  preprocessed_smiles = smiles
184
  offset = 0
 
185
  protected_residues = []
186
+
187
  for pos in complex_positions:
188
  start = pos['start'] + offset
189
+ end = pos['end'] + offset
 
190
  complex_part = preprocessed_smiles[start:end]
191
+
192
+ # keep your stereo sanity check (OK to keep)
193
  if not ('[C@H]' in complex_part or '[C@@H]' in complex_part):
194
+ # Dtg internal often *does* have [C@@H], so it will pass.
195
+ # If you find UAAs without explicit stereo, you may relax this guard.
196
+ pass
197
+
198
  placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
 
199
  preprocessed_smiles = preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
 
200
  offset += len(placeholder) - (end - start)
201
+
202
  protected_residues.append({
203
  'placeholder': placeholder,
204
  'type': pos['type'],
205
  'content': complex_part
206
  })
207
+
208
  return preprocessed_smiles, protected_residues
209
+
210
  def split_on_bonds(self, smiles, protected_residues=None):
211
  """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
212
  positions = []
 
365
  def identify_residue(self, segment):
366
  if 'complex_type' in segment:
367
  return segment['complex_type'], []
368
+
369
+ # If this was protected by dynamic UAA shielding
370
+ if segment.get('complex_type') in self.uaa_internal_exact:
371
+ return segment['complex_type'], []
372
+
373
  content = self.clean_terminal_carboxyl(segment)
374
  mods = self.get_modifications(segment)
375
 
 
960
 
961
  return sio.getvalue().encode('utf-8')
962
 
963
class PeptideEncoder:
    """Assemble a SMILES string from a peptide sequence.

    Residues are looked up as per-residue fragments (``SEG_L``, ``SEG_D``,
    ``UAA_SEG`` or fragments derived from the amino-acid tables) and joined
    with inter-residue bond tokens; a terminal ``C(=O)O`` is appended for
    linear peptides.
    """

    # map one-letter <-> three-letter (upper case = L-form, lower case = D-form)
    one_to_three = {
        'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His', 'I': 'Ile',
        'K': 'Lys', 'L': 'Leu', 'M': 'Met', 'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 'S': 'Ser',
        'T': 'Thr', 'V': 'Val', 'W': 'Trp', 'Y': 'Tyr',
        'a': 'ala', 'c': 'cys', 'd': 'asp', 'e': 'glu', 'f': 'phe', 'g': 'gly', 'h': 'his', 'i': 'ile',
        'k': 'lys', 'l': 'leu', 'm': 'met', 'n': 'asn', 'p': 'pro', 'q': 'gln', 'r': 'arg', 's': 'ser',
        't': 'thr', 'v': 'val', 'w': 'trp', 'y': 'tyr'
    }

    # L-form uses [C@@H], D-form uses [C@H].
    SEG_L = {
        'Ala': '[C@@H](C)',
        'Gly': 'C',  # bare carbon; presumably what the analyzer treats as Gly in context
        'Val': '[C@@H](C(C)C)',
        'Leu': '[C@@H](CC(C)C)',
        'Ile': '[C@@H]([C@H](C)CC)',
        'Ser': '[C@@H](CO)',
        'Thr': '[C@@H]([C@@H](C)O)',
        'Cys': '[C@@H](CS)',
        'Met': '[C@@H](CCSC)',
        'Phe': '[C@@H](Cc1ccccc1)',
        'Tyr': '[C@@H](Cc1ccc(O)cc1)',
        'Trp': '[C@@H](Cc1c[nH]c2ccccc12)',
        'His': '[C@@H](Cc1c[nH]cn1)',
        'Asp': '[C@@H](CC(=O)O)',
        'Glu': '[C@@H](CCC(=O)O)',
        'Asn': '[C@@H](CC(=O)N)',
        'Gln': '[C@@H](CCC(=O)N)',
        'Lys': '[C@@H](CCCCN)',
        'Arg': '[C@@H](CCCNC(=N)N)',
        'Pro': 'CC[C@H]2CN2'  # only used if not doing ring-number closure
    }

    # D-forms: mirror EVERY stereo tag (@@ <-> @) in one pass. The NUL
    # placeholder keeps the two replacements from undoing each other, so
    # residues with two centres (Ile, Thr) become true enantiomers; chained
    # '[C@@H]' -> '[C@H]' replaces would leave the beta carbon of Ile as is.
    SEG_D = {
        k.lower(): v.replace('@@', '\0').replace('@', '@@').replace('\0', '@')
        for k, v in SEG_L.items()
    }

    UAA_SEG = {
        'Aib': 'C(C)(C)',             # alpha,alpha-dimethyl gly
        'Nle': '[C@@H](CCCC)',        # norleucine ~ Lys w/o terminal amine
        'Hph': '[C@@H](CCc1ccccc1)',  # homophenylalanine
        'Cyl': 'C1(CCCC1)',           # cycloleucine
    }

    def __init__(self):
        """Collect code -> internal-fragment mappings from both amino-acid tables."""
        self.ssc_code_to_internal = {}
        # Order matters: entries from all_aminos overwrite specific_aminos
        # entries that share the same residue code.
        for table in (specific_aminos, all_aminos):
            for data in table.values():
                code = data["Code"]
                internal = self._internal_fragment(data)
                if internal:
                    self.ssc_code_to_internal[code] = internal

    @staticmethod
    def _internal_fragment(data):
        """Return the internal fragment for one table entry, or '' if none can be derived."""
        cterm = data.get("cterm", "")
        nterm = data.get("nterm", "")
        # Templates may be False in the tables; truthiness covers that too.
        if cterm:
            return _internal_from_cterm(cterm)
        if nterm:
            return _internal_from_nterm(nterm)
        return ""

    def _segment_for(self, code):
        """Return the SMILES fragment for a residue code.

        Lookup order: ``SEG_L``, ``SEG_D``, ``UAA_SEG``, table-derived
        fragments, then ``SEG_L`` again with the code capitalised.

        Raises:
            ValueError: if the code is unknown.
        """
        if code in self.SEG_L:
            return self.SEG_L[code]
        if code in self.SEG_D:
            return self.SEG_D[code]
        if code in self.UAA_SEG:
            return self.UAA_SEG[code]

        if code in self.ssc_code_to_internal:
            return self.ssc_code_to_internal[code]

        cap = code[:1].upper() + code[1:].lower()
        if cap in self.SEG_L:
            return self.SEG_L[cap]
        raise ValueError(f"Unknown residue code: {code}")

    def _is_one_letter_seq(self, seq: str) -> bool:
        """Return True when ``seq`` has no '-' separator, i.e. looks like one-letter codes.

        NOTE(review): a lone three-letter token such as 'Ala' also has no
        '-' and is therefore read as three one-letter residues.
        """
        return "-" not in seq

    def _norm_token(self, tok):
        """Normalize tokens like 'A', 'a', 'Ala', 'ala', 'Ala(N-Me)' -> (code, n_me_flag)"""
        n_me = False
        tok = tok.strip()
        if tok in self.one_to_three:
            base = self.one_to_three[tok]
        else:
            m = re.match(r'^([A-Za-z\-]+)(\((.*?)\))?$', tok)
            if not m:
                # Unparseable token: hand it back unchanged; _segment_for
                # decides whether it is a known code.
                return tok, n_me
            base = m.group(1)
            mods = m.group(3) or ""
            if 'N-Me' in mods or 'Nme' in mods or 'NME' in mods:
                n_me = True
        return base, n_me

    def _bond_for(self, n_me=False, pro_ring=False, ring_idx=1):
        """Return the inter-residue bond token.

        ``pro_ring`` takes precedence and opens ring ``ring_idx`` on the
        nitrogen; otherwise an N-methylated or plain amide token is returned.
        """
        if pro_ring:
            return f'C(=O)N{ring_idx}'
        return 'N(C)C(=O)' if n_me else 'NC(=O)'

    def _split_tokens(self, seq):
        """Split a sequence into residue tokens.

        Lists/tuples are copied as-is; strings without '-' are split per
        character; otherwise the string is split on '-' except where the
        dash sits inside parentheses (e.g. 'Ala(N-Me)').
        """
        if isinstance(seq, (list, tuple)):
            return list(seq)
        seq = seq.strip()

        if self._is_one_letter_seq(seq):
            return list(seq)

        return [t for t in re.split(r'-(?![^()]*\))', seq) if t]

    def encode(self, seq, cyclic=False, use_proline_ring=True):
        """Encode a peptide sequence as a SMILES string.

        Args:
            seq: list of tokens or a string like
                 'Ala-Gly-Phe', 'A-G-F', 'Ala(N-Me)-Leu-Ser', 'Aib-Nle-Arg';
                 D-forms: 'ala-gly', or 'a-g'.
            cyclic: macrocyclisation is NOT implemented yet. When True the
                 chain is returned without the terminal carboxyl and without
                 any ring closure.
            use_proline_ring: if True, a bond into Pro/pro is written as
                 ``C(=O)N1`` and the Pro segment closes ring 1.

        Returns:
            The assembled SMILES string.

        Raises:
            ValueError: if a residue code is unknown.
        """
        toks = self._split_tokens(seq)
        res, mods = [], []
        for t in toks:
            base, n_me = self._norm_token(t)
            res.append(base)
            mods.append(n_me)

        # Build segments
        segs = [self._segment_for(r) for r in res]

        # Proline ring bookkeeping: the N{digit}...{digit} closure is only
        # used when a bond *into* Pro occurs.
        bonds = []
        for i in range(len(segs) - 1):
            next_is_pro = res[i + 1] in ('Pro', 'pro')
            if use_proline_ring and next_is_pro:
                bonds.append(self._bond_for(n_me=mods[i], pro_ring=True, ring_idx=1))
                # Make the Pro segment end with the matching ring digit
                segs[i + 1] = 'CCC[C@H]1' if res[i + 1] == 'Pro' else 'CCC[C@@H]1'
            else:
                bonds.append(self._bond_for(n_me=mods[i], pro_ring=False))

        # Assemble linear chain:
        # [segment0] + bond0 + [segment1] + bond1 + ... + [segmentN-1] + C(=O)O
        out = []
        for i, s in enumerate(segs):
            out.append(s)
            if i < len(bonds):
                out.append(bonds[i])
        if cyclic:
            # TODO: ring closure between the two termini is not implemented.
            pass
        else:
            out.append('C(=O)O')

        return ''.join(out)
1131
+
1132
  def process_input(
1133
  smiles_input=None,
1134
  file_obj=None,
 
1273
  #structure_files if structure_files else []
1274
  )
1275
 
1276
def process_sequence_to_smiles(
    seq_input: str,
    show_segment_details: bool = False,
    use_proline_ring: bool = True,
    cyclic: bool = False
):
    """Encode a peptide sequence to SMILES and verify it by analyzing it back.

    The sequence is encoded with ``PeptideEncoder``, the resulting SMILES
    is analyzed with ``PeptideAnalyzer``, and both directions are
    reported in a text summary.

    Args:
        seq_input: Peptide sequence text entered by the user.
        show_segment_details: Passed to the analyzer as ``verbose``; when
            true and the analysis includes ``'details'``, they are
            appended to the summary.
        use_proline_ring: Forwarded to ``PeptideEncoder.encode``.
        cyclic: Forwarded to ``PeptideEncoder.encode``.

    Returns:
        A ``(summary_text, image, smiles)`` tuple. On empty input or any
        failure, the first element is a message and the other two are
        ``None``.
    """
    if not seq_input or not seq_input.strip():
        return "Please enter a peptide sequence.", None, None

    try:
        enc = PeptideEncoder()
        smiles = enc.encode(seq_input.strip(), cyclic=cyclic, use_proline_ring=use_proline_ring)

        analyzer = PeptideAnalyzer()
        # Pre-check that the generated string is recognized as a peptide.
        if not analyzer.is_peptide(smiles):
            return "Internal error: generated SMILES did not look like a peptide.", None, None

        # Round-trip: parse the generated SMILES back into a sequence.
        analysis = analyzer.analyze_structure(smiles, verbose=show_segment_details)
        three_letter = analysis['three_letter']
        one_letter = analysis['one_letter']
        is_cyclic = analysis['is_cyclic']
        details = analysis.get('details', "")

        # RDKit returns None (rather than raising) for SMILES it cannot
        # parse; report that explicitly instead of handing None to the
        # annotator and surfacing an unrelated attribute error.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Internal error: generated SMILES could not be parsed.", None, None
        img = annotate_cyclic_structure(mol, three_letter)

        summary = [
            "Peptide → SMILES",
            "-" * 50,
            f"Input sequence: {seq_input}",
            f"Generated SMILES:\n{smiles}",
            "",
            "Round-trip check (SMILES → sequence):",
            f"Sequence: {three_letter}",
            f"One-letter code: {one_letter}",
            f"Is Cyclic: {'Yes' if is_cyclic else 'No'}",
        ]

        if show_segment_details and details:
            summary.append("\n" + "="*50)
            summary.append("SEGMENT ANALYSIS")
            summary.append("="*50)
            summary.append(details)

        # Report residues that are not one of the 20 standard amino
        # acids in either capitalized or all-lower-case spelling.
        standard_names = (
            'Ala', 'Cys', 'Asp', 'Glu', 'Phe', 'Gly', 'His', 'Ile', 'Lys', 'Leu',
            'Met', 'Asn', 'Pro', 'Gln', 'Arg', 'Ser', 'Thr', 'Val', 'Trp', 'Tyr',
        )
        # Built once per call instead of once per residue.
        standard = frozenset(standard_names) | {name.lower() for name in standard_names}
        detected_uaas = [aa for aa in analysis['residues'] if aa not in standard]
        if detected_uaas:
            summary.append(f"\nDetected UAAs (round-trip): {', '.join(sorted(set(detected_uaas)))}")

        return "\n".join(summary), img, smiles

    except Exception as e:
        # UI boundary: turn any failure into a message for the text box.
        return f"Error: {str(e)}", None, None
1337
+
1338
# Introductory text and copy-paste examples shown above both tabs.
_APP_DESCRIPTION = """
Analyze and visualize peptide structures from SMILES notation:
1. Validates if the input is a peptide structure
2. Determines if the peptide is cyclic
3. Parses the amino acid sequence
4. Creates 2D structure visualization with residue annotations

Input: Either enter a SMILES string directly or upload a text file containing SMILES strings

Example SMILES strings (copy and paste):
```
CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@@H](C)N(C)C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H]2CCCN2C1=O
```
```
C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O
```
```
CC(C)C[C@H]1C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)NCC(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N(C)CC(=O)N[C@@H]([C@@H](C)O)C(=O)N(C)[C@@H](C)C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N1C
```
Example Peptide strings (copy and paste):
```
AGFS
```
```
Ala-Gly-Phe-Ser
```
```
Aib-Dtg-Ser
```
"""


def _run_smiles(s_in, f_in, sh):
    """Click handler for the SMILES tab; 3D generation and UFF are switched off."""
    return process_input(
        smiles_input=s_in,
        file_obj=f_in,
        show_segment_details=sh,
        generate_3d=False,
        use_uff=False
    )


with gr.Blocks(title="Peptide Structure Analyzer and Visualizer") as demo:
    gr.Markdown("# Peptide Structure Analyzer and Visualizer")
    gr.Markdown(_APP_DESCRIPTION)

    # Tab 1: analyze a SMILES string (typed or uploaded).
    with gr.Tab("SMILES → Sequence"):
        gr.Markdown("Analyze peptide SMILES, detect cyclicity, parse sequence, and annotate.")
        smiles_in = gr.Textbox(
            label="Enter SMILES string",
            lines=2,
            placeholder="Enter SMILES notation of peptide...",
        )
        file_in = gr.File(label="Or upload a text file with SMILES", file_types=[".txt"])
        show_seg = gr.Checkbox(label="Show segmentation details", value=False)
        run_btn_1 = gr.Button("Analyze")
        out_text_1 = gr.Textbox(label="Analysis Results", lines=12)
        out_img_1 = gr.Image(label="2D Structure with Annotations", type="pil")
        out_md_1 = gr.Markdown(label="Side Notes for Non-Standard Amino Acids")

        run_btn_1.click(
            _run_smiles,
            inputs=[smiles_in, file_in, show_seg],
            outputs=[out_text_1, out_img_1, out_md_1],
        )

    # Tab 2: encode a peptide sequence and show the round-trip result.
    with gr.Tab("Peptide → SMILES"):
        gr.Markdown("Encode a peptide sequence to SMILES (one-letter or three-letter) and verify round-trip.")
        seq_in = gr.Textbox(
            label="Enter peptide sequence",
            lines=2,
            placeholder="Examples: AGFS | Ala-Gly-Phe-Ser | Ala(N-Me)-Pro-Phe | Aib-Dtg-Ser",
        )
        with gr.Row():
            use_pro = gr.Checkbox(label="Use Proline ring join", value=True)
            cyc = gr.Checkbox(label="Cyclic (macrocycle)", value=False)
            show_seg2 = gr.Checkbox(label="Show segmentation details", value=False)
        run_btn_2 = gr.Button("Encode")
        out_text_2 = gr.Textbox(label="Results & Round-trip", lines=14)
        out_img_2 = gr.Image(label="2D Structure with Annotations", type="pil")
        out_smiles = gr.Textbox(label="Generated SMILES (copyable)", lines=2)

        # The input order matches the handler's positional parameters.
        run_btn_2.click(
            process_sequence_to_smiles,
            inputs=[seq_in, show_seg2, use_pro, cyc],
            outputs=[out_text_2, out_img_2, out_smiles],
        )

if __name__ == "__main__":
    demo.launch(share=True)
1420
+
1421
+
1422
+ """
1423
  iface = gr.Interface(
1424
  fn=process_input,
1425
  inputs=[
 
1480
  if __name__ == "__main__":
1481
  iface.launch(share=True)
1482
 
1483
+ """
1484
  """
1485
  5. Optional linear representation
1486
  6. Optional 3D structure generation (ETKDG and UFF methods)