Spaces:

ChatterjeeLab
/

SMILES2PEPTIDE

Running

App Files Files Community

yzhang@u.duke.nus.edu committed on Nov 4, 2025

Commit

4868d91

1 Parent(s): 432a60b

add sequence 2 smiles feature

Browse files

Files changed (2) hide show

aminoacid_selective.py +557 -0
app.py +412 -36

aminoacid_selective.py ADDED Viewed

	@@ -0,0 +1,557 @@

+#!/usr/bin/env python
+"""Definitions and properties of amino-acids for p2smi"""
# Amino-acid table used for sequence <-> SMILES conversion.
#
# Layout of the table:
#   1. Fmoc building blocks / non-canonical residues (single non-ASCII "Letter").
#   2. The 20 natural L-amino acids (upper-case one-letter codes).
#   3. The corresponding D-amino acids (lower-case one-letter codes).
#
# Fields per entry:
#   Code        three-letter code (lower-case for D-forms)
#   Formula     molecular formula string
#   Letter      one-letter code
#   MolWeight   molecular weight
#   SMILES      SMILES of the free amino acid
#   nterm/cterm/disulphide/ester
#               SMILES carrying an attachment marker ("*", "[*:1]", "[*:2]"),
#               or False when the entry defines no such variant
#
# NOTE(review): every Fmoc entry carries the same Formula ("C28H29NO5") and
# MolWeight (221.141578848); these look like copy-pasted placeholders and
# should be confirmed before being relied upon.
# NOTE(review): MolWeight is a float for the Fmoc entries but a string for the
# natural/D entries; consumers must cope with both until this is unified.
specific_aminos = {
    "Fmoc-Aib-OH": {
        "Code": "Aib",
        "Formula": "C28H29NO5",
        "Letter": "Ŷ",
        "MolWeight": 221.141578848,
        "SMILES": "CC(C)(N)C(=O)O",
        "cterm": "NC(C)(C)C(=O)[*:2]",
        "disulphide": False,
        "ester": False,
        "nterm": "[*:1]NC(C)(C)C(=O)O",
    },
    "Fmoc-Asp(OtBu)-(Dmb)Gly-OH": {
        "Code": "Dtg",
        "Formula": "C28H29NO5",
        "Letter": "Ĝ",
        "MolWeight": 221.141578848,
        "SMILES": "N[C@@H](CC(=O)OC(C)(C)C)C(=O)N(CC1=C(C=C(C=C1)OC)OC)CC(=O)O",
        "nterm": "[*:1]N[C@@H](CC(=O)OC(C)(C)C)C(=O)N(CC1=C(C=C(C=C1)OC)OC)CC(=O)O",
        "cterm": "N[C@@H](CC(=O)OC(C)(C)C)C(=O)N(CC1=C(C=C(C=C1)OC)OC)CC(=O)[*:2]",
        "disulphide": False,
        "ester": False,
    },
    "Fmoc-Cys(Mmt)-OH": {
        "Code": "Cmt",
        "Formula": "C28H29NO5",
        "Letter": "Ĉ",
        "MolWeight": 221.141578848,
        "SMILES": "COC1=CC=C(C=C1)C(C2=CC=CC=C2)(C3=CC=CC=C3)SC[C@@H](N)C(=O)O",
        "nterm": "N([*:1])[C@@H](CSC(C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=C(OC)C=C3)C(=O)O",
        "cterm": "N[C@@H](CSC(C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=C(OC)C=C3)C(=O)[*:2]",
        "disulphide": False,
        "ester": False,
    },
    "Fmoc-Glu(OAll)-OH": {
        "Code": "Eal",
        "Formula": "C28H29NO5",
        "Letter": "Ė",
        "MolWeight": 221.141578848,
        "SMILES": "C=CCOC(=O)CC[C@@H](N)C(=O)O",
        "nterm": "[*:1]N[C@@H](CCC(=O)OCC=C)C(=O)O",
        "cterm": "N[C@@H](CCC(=O)OCC=C)C(=O)[*:2]",
        "disulphide": False,
        "ester": False,
    },
    # NOTE(review): the side chain below reads CCCN (three methylenes before
    # the nitrogen) whereas the L-Lysine entry uses CCCCN -- confirm that this
    # is the intended structure for a Lys derivative.
    "Fmoc-Lys(palmitoyl-Glu-OtBu)-OH": {
        "Code": "Kpg",
        "Formula": "C28H29NO5",
        "Letter": "Ƙ",
        "MolWeight": 221.141578848,
        "SMILES": "N[C@@H](CCCNC(=O)CCC[C@@H](NC(=O)CCCCCCCCCCCCCCCC)C(=O)OC(C)(C)C)C(=O)O",
        "nterm": "[*:1]N[C@@H](CCCN(C(=O)CCC[C@@H](NC(=O)CCCCCCCCCCCCCCCC)C(=O)OC(C)(C)C))C(=O)O",
        "cterm": "N[C@@H](CCCN(C(=O)CCC[C@@H](NC(=O)CCCCCCCCCCCCCCCC)C(=O)OC(C)(C)C))C(=O)[*:2]",
        "disulphide": False,
        "ester": False,
    },
    "Fmoc-Thr(PO(OBzl)OH)-OH": {
        "Code": "Tpb",
        "Formula": "C28H29NO5",
        "Letter": "Ṯ",
        "MolWeight": 221.141578848,
        "SMILES": "N[C@@H]([C@H](C)OP(=O)(O)OCc1ccccc1)C(=O)O",
        "disulphide": False,
        "ester": False,
        "nterm": "[*:1]N[C@@H]([C@H](C)OP(=O)(O)OCC1=CC=CC=C1)C(=O)O",
        "cterm": "N[C@@H]([C@H](C)OP(=O)(O)OCC1=CC=CC=C1)C(=O)[*:2]",
    },
    "Fmoc-Cycloleucine": {
        "Code": "Cyl",
        "Formula": "C28H29NO5",
        "Letter": "Ċ",
        "MolWeight": 221.141578848,
        "SMILES": "NC1(CCCC1)C(=O)O",
        "nterm": "[*:1]NC1(CCCC1)C(=O)O",
        "cterm": "NC1(CCCC1)C(=O)[*:2]",
        "disulphide": False,
        "ester": False,
    },
    "Fmoc-N-Me-Ala-OH": {
        "Code": "Nma",
        "Formula": "C28H29NO5",
        "Letter": "Ṃ",
        "MolWeight": 221.141578848,
        "SMILES": "CN([C@@H](C)C(=O)O)",
        "cterm": "N(C)[C@@H](C)C(=O)[*:2]",
        "disulphide": False,
        "ester": False,
        "nterm": "[*:1]N(C)[C@@H](C)C(=O)O",
    },
    "Fmoc-N-Me-Leu-OH": {
        "Code": "Nml",
        "Formula": "C28H29NO5",
        "Letter": "Ŀ",
        "MolWeight": 221.141578848,
        # The carboxyl group belongs on the alpha carbon, not on the nitrogen;
        # SMILES and cterm are written in the same shape as the N-Me-Ala entry
        # so the trailing "C(=O)[*:2]" anchor can be recognised.
        "SMILES": "CN[C@@H](CC(C)C)C(=O)O",
        "cterm": "N(C)[C@@H](CC(C)C)C(=O)[*:2]",
        "disulphide": False,
        "ester": False,
        "nterm": "[*:1]N(C)[C@@H](CC(C)C)C(=O)O",
    },
    "Fmoc-Nle-OH": {
        "Code": "Nle",
        "Formula": "C28H29NO5",
        "Letter": "Ł",
        "MolWeight": 221.141578848,
        "SMILES": "N[C@@H](CCCC)C(=O)O",
        "nterm": "[*:1]N[C@@H](CCCC)C(=O)O",
        "cterm": "N[C@@H](CCCC)C(=O)[*:2]",
        "disulphide": False,
        "ester": False,
    },
    "N-Fmoc-L-homophenylalanine": {
        "Code": "Hph",
        "Formula": "C28H29NO5",
        "Letter": "Ĥ",
        "MolWeight": 221.141578848,
        "SMILES": "N[C@@H](CCC1=CC=CC=C1)C(=O)O",
        "nterm": "[*:1]N[C@@H](CCC1=CC=CC=C1)C(=O)O",
        "cterm": "N[C@@H](CCC1=CC=CC=C1)C(=O)[*:2]",
        "disulphide": False,
        "ester": False,
    },
    # ---- Natural L-amino acids -------------------------------------------
    "Glycine": {
        "Code": "Gly",
        "Formula": "C2H5NO2",
        "Letter": "G",
        "MolWeight": "75.07",
        "SMILES": "NCC(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "L-Alanine": {
        "Code": "Ala",
        "Formula": "C3H7NO2",
        "Letter": "A",
        "MolWeight": "89.09",
        "SMILES": "N[C@@H](C)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "L-Arginine": {
        "Code": "Arg",
        "Formula": "C6H14N4O2",
        "Letter": "R",
        "MolWeight": "174.20",
        "SMILES": "N[C@@H](CCCNC(=N)N)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": "N[C@@H](CCCNC(=N*)N)C(=O)O",
    },
    "L-Asparagine": {
        "Code": "Asn",
        "Formula": "C4H8N2O3",
        "Letter": "N",
        "MolWeight": "132.12",
        "SMILES": "N[C@@H](CC(=O)N)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": "N[C@@H](CC(=O)N*)C(=O)O",
    },
    "L-Aspartic_Acid": {
        "Code": "Asp",
        "Formula": "C4H7NO4",
        "Letter": "D",
        "MolWeight": "133.10",
        "SMILES": "N[C@@H](CC(=O)O)C(=O)O",
        "cterm": "N[C@@H](CC*(=O))C(=O)O",
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "L-Cysteine": {
        "Code": "Cys",
        "Formula": "C3H7NO2S",
        "Letter": "C",
        "MolWeight": "121.16",
        "SMILES": "N[C@@H](CS)C(=O)O",
        "cterm": False,
        "disulphide": "N[C@@H](CS*)C(=O)O",
        "ester": False,
        "nterm": False,
    },
    "L-Glutamic_Acid": {
        "Code": "Glu",
        "Formula": "C5H9NO4",
        "Letter": "E",
        "MolWeight": "147.13",
        "SMILES": "N[C@@H](CCC(=O)O)C(=O)O",
        "cterm": "N[C@@H](CCC*(=O))C(=O)O",
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "L-Glutamine": {
        "Code": "Gln",
        "Formula": "C5H10N2O3",
        "Letter": "Q",
        "MolWeight": "146.15",
        "SMILES": "N[C@@H](CCC(=O)N)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": "N[C@@H](CCC(=O)N*)C(=O)O",
    },
    "L-Histidine": {
        "Code": "His",
        "Formula": "C6H9N3O2",
        "Letter": "H",
        "MolWeight": "155.16",
        "SMILES": "N[C@@H](CC1=CNC=N1)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "L-Isoleucine": {
        "Code": "Ile",
        "Formula": "C6H13NO2",
        "Letter": "I",
        "MolWeight": "131.18",
        "SMILES": "N[C@@H]([C@H](CC)C)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "L-Leucine": {
        "Code": "Leu",
        "Formula": "C6H13NO2",
        "Letter": "L",
        "MolWeight": "131.18",
        "SMILES": "N[C@@H](CC(C)C)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "L-Lysine": {
        "Code": "Lys",
        "Formula": "C6H14N2O2",
        "Letter": "K",
        "MolWeight": "146.19",
        "SMILES": "N[C@@H](CCCCN)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": "N[C@@H](CCCCN*)C(=O)O",
    },
    "L-Methionine": {
        "Code": "Met",
        "Formula": "C5H11NO2S",
        "Letter": "M",
        "MolWeight": "149.21",
        "SMILES": "N[C@@H](CCSC)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "L-Phenylalanine": {
        "Code": "Phe",
        "Formula": "C9H11NO2",
        "Letter": "F",
        "MolWeight": "165.19",
        "SMILES": "N[C@@H](Cc1ccccc1)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "L-Proline": {
        "Code": "Pro",
        "Formula": "C5H9NO2",
        "Letter": "P",
        "MolWeight": "115.13",
        "SMILES": "N1[C@@H](CCC1)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "L-Serine": {
        "Code": "Ser",
        "Formula": "C3H7NO3",
        "Letter": "S",
        "MolWeight": "105.09",
        "SMILES": "N[C@@H](CO)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": "N[C@@H](CO*)C(=O)O",
        "nterm": False,
    },
    "L-Threonine": {
        "Code": "Thr",
        "Formula": "C4H9NO3",
        "Letter": "T",
        "MolWeight": "119.12",
        "SMILES": "N[C@@H]([C@H](O)C)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": "N[C@@H]([C@H](O*)C)C(=O)O",
        "nterm": False,
    },
    "L-Tryptophan": {
        "Code": "Trp",
        "Formula": "C11H12N2O2",
        "Letter": "W",
        "MolWeight": "204.23",
        "SMILES": "N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "L-Tyrosine": {
        "Code": "Tyr",
        "Formula": "C9H11NO3",
        "Letter": "Y",
        "MolWeight": "181.19",
        "SMILES": "N[C@@H](Cc1ccc(O)cc1)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": "N[C@@H](Cc1ccc(O*)cc1)C(=O)O",
        "nterm": False,
    },
    "L-Valine": {
        "Code": "Val",
        "Formula": "C5H11NO2",
        "Letter": "V",
        "MolWeight": "117.15",
        "SMILES": "N[C@@H](C(C)C)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    # ---- D-amino acids ---------------------------------------------------
    "D-Alanine": {
        "Code": "ala",
        "Formula": "C3H7NO2",
        "Letter": "a",
        "MolWeight": "89.09",
        "SMILES": "N[C@H](C)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "D-Arginine": {
        "Code": "arg",
        "Formula": "C6H14N4O2",
        "Letter": "r",
        "MolWeight": "174.20",
        "SMILES": "N[C@H](CCCNC(=N)N)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": "N[C@H](CCCNC(=N*)N)C(=O)O",
    },
    "D-Asparagine": {
        "Code": "asn",
        "Formula": "C4H8N2O3",
        "Letter": "n",
        "MolWeight": "132.12",
        "SMILES": "N[C@H](CC(=O)N)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": "N[C@H](CC(=O)N*)C(=O)O",
    },
    "D-Aspartic_Acid": {
        "Code": "asp",
        "Formula": "C4H7NO4",
        "Letter": "d",
        "MolWeight": "133.10",
        "SMILES": "N[C@H](CC(=O)O)C(=O)O",
        "cterm": "N[C@H](CC*(=O))C(=O)O",
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "D-Cysteine": {
        "Code": "cys",
        "Formula": "C3H7NO2S",
        "Letter": "c",
        "MolWeight": "121.16",
        "SMILES": "N[C@H](CS)C(=O)O",
        "cterm": False,
        "disulphide": "N[C@H](CS*)C(=O)O",
        "ester": False,
        "nterm": False,
    },
    "D-Glutamic_Acid": {
        "Code": "glu",
        "Formula": "C5H9NO4",
        "Letter": "e",
        "MolWeight": "147.13",
        "SMILES": "N[C@H](CCC(=O)O)C(=O)O",
        "cterm": "N[C@H](CCC*(=O))C(=O)O",
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "D-Glutamine": {
        "Code": "gln",
        "Formula": "C5H10N2O3",
        "Letter": "q",
        "MolWeight": "146.15",
        "SMILES": "N[C@H](CCC(=O)N)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": "N[C@H](CCC(=O)N*)C(=O)O",
    },
    "D-Histidine": {
        "Code": "his",
        "Formula": "C6H9N3O2",
        "Letter": "h",
        "MolWeight": "155.16",
        "SMILES": "N[C@H](CC1=CNC=N1)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "D-Isoleucine": {
        "Code": "ile",
        "Formula": "C6H13NO2",
        "Letter": "i",
        "MolWeight": "131.18",
        "SMILES": "N[C@H]([C@@H](CC)C)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "D-Leucine": {
        "Code": "leu",
        "Formula": "C6H13NO2",
        "Letter": "l",
        "MolWeight": "131.18",
        "SMILES": "N[C@H](CC(C)C)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "D-Lysine": {
        "Code": "lys",
        "Formula": "C6H14N2O2",
        "Letter": "k",
        "MolWeight": "146.19",
        "SMILES": "N[C@H](CCCCN)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": "N[C@H](CCCCN*)C(=O)O",
    },
    "D-Methionine": {
        "Code": "met",
        "Formula": "C5H11NO2S",
        "Letter": "m",
        "MolWeight": "149.21",
        "SMILES": "N[C@H](CCSC)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "D-Phenylalanine": {
        "Code": "phe",
        "Formula": "C9H11NO2",
        "Letter": "f",
        "MolWeight": "165.19",
        "SMILES": "N[C@H](Cc1ccccc1)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "D-Proline": {
        "Code": "pro",
        "Formula": "C5H9NO2",
        "Letter": "p",
        "MolWeight": "115.13",
        "SMILES": "N1[C@H](CCC1)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "D-Serine": {
        "Code": "ser",
        "Formula": "C3H7NO3",
        "Letter": "s",
        "MolWeight": "105.09",
        "SMILES": "N[C@H](CO)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": "N[C@H](CO*)C(=O)O",
        "nterm": False,
    },
    "D-Tryptophan": {
        "Code": "trp",
        "Formula": "C11H12N2O2",
        "Letter": "w",
        "MolWeight": "204.23",
        "SMILES": "N[C@H](CC(=CN2)C1=C2C=CC=C1)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "D-Tyrosine": {
        "Code": "tyr",
        "Formula": "C9H11NO3",
        "Letter": "y",
        "MolWeight": "181.19",
        "SMILES": "N[C@H](Cc1ccc(O)cc1)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": "N[C@H](Cc1ccc(O*)cc1)C(=O)O",
        "nterm": False,
    },
    "D-Valine": {
        "Code": "val",
        "Formula": "C5H11NO2",
        "Letter": "v",
        "MolWeight": "117.15",
        "SMILES": "N[C@H](C(C)C)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": False,
        "nterm": False,
    },
    "D-Threonine": {
        "Code": "thr",
        "Formula": "C4H9NO3",
        "Letter": "t",
        "MolWeight": "119.12",
        "SMILES": "N[C@H]([C@@H](O)C)C(=O)O",
        "cterm": False,
        "disulphide": False,
        "ester": "N[C@H]([C@@H](O*)C)C(=O)O",
        "nterm": False,
    },
}

app.py CHANGED Viewed

@@ -24,6 +24,39 @@ from io import BytesIO
 import tempfile
 from rdkit import Chem
 from swisssidechain import all_aminos
 class PeptideAnalyzer:
     def __init__(self):
@@ -71,27 +104,40 @@ class PeptideAnalyzer:
         self._build_swisssidechain_lookups()
     def _build_swisssidechain_lookups(self):
-        """Side chain lookups for SwissSidechain UAAs"""
-        # Exact SMILES match
         self.exact_smiles_lookup = {}
-        # Clean SMILES lookup (without stereochemistry)
         self.clean_smiles_lookup = {}
         for uaa_name, uaa_data in all_aminos.items():
-            code = uaa_data["Code"]
-            letter = uaa_data["Letter"]
-            smiles = uaa_data["SMILES"]
-            self.three_to_one[code] = letter
-            self.exact_smiles_lookup[smiles] = code
-            # Clean SMILES (no stereochemistry)
-            clean_smiles = self._remove_stereochemistry(smiles)
-            if clean_smiles not in self.clean_smiles_lookup:
-                self.clean_smiles_lookup[clean_smiles] = []
-            self.clean_smiles_lookup[clean_smiles].append(code)
     def _remove_stereochemistry(self, smiles):
         """Remove stereochemistry from SMILES"""
@@ -106,52 +152,61 @@ class PeptideAnalyzer:
         return cleaned
     def preprocess_complex_residues(self, smiles):
-        """Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
         complex_positions = []
         for pattern, residue_type in self.complex_residue_patterns:
             for match in re.finditer(pattern, smiles):
                 if not any(pos['start'] <= match.start() < pos['end'] or
-                          pos['start'] < match.end() <= pos['end'] for pos in complex_positions):
                     complex_positions.append({
                         'start': match.start(),
                         'end': match.end(),
                         'type': residue_type,
                         'pattern': match.group()
                     })
         complex_positions.sort(key=lambda x: x['start'])
         if not complex_positions:
             return smiles, []
         preprocessed_smiles = smiles
         offset = 0
         protected_residues = []
         for pos in complex_positions:
             start = pos['start'] + offset
-            end = pos['end'] + offset
             complex_part = preprocessed_smiles[start:end]
             if not ('[C@H]' in complex_part or '[C@@H]' in complex_part):
-                continue
             placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
             preprocessed_smiles = preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
             offset += len(placeholder) - (end - start)
             protected_residues.append({
                 'placeholder': placeholder,
                 'type': pos['type'],
                 'content': complex_part
             })
         return preprocessed_smiles, protected_residues
     def split_on_bonds(self, smiles, protected_residues=None):
         """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
         positions = []
@@ -310,7 +365,11 @@ class PeptideAnalyzer:
     def identify_residue(self, segment):
         if 'complex_type' in segment:
             return segment['complex_type'], []
         content = self.clean_terminal_carboxyl(segment)
         mods = self.get_modifications(segment)
@@ -901,6 +960,175 @@ class PeptideStructureGenerator:
         return sio.getvalue().encode('utf-8')
 def process_input(
     smiles_input=None,
     file_obj=None,
@@ -1045,6 +1273,153 @@ def process_input(
             #structure_files if structure_files else []
         )
 iface = gr.Interface(
     fn=process_input,
     inputs=[
@@ -1105,6 +1480,7 @@ iface = gr.Interface(
 if __name__ == "__main__":
     iface.launch(share=True)
 """
 5. Optional linear representation
 6. Optional 3D structure generation (ETKDG and UFF methods)

 import tempfile
 from rdkit import Chem
 from swisssidechain import all_aminos
+from aminoacid_selective import specific_aminos
+def _internal_from_cterm(cterm: str) -> str:
+    s = cterm.strip()
+    s = re.sub(r'C\(=O\)\[\*:\s*2\]\s*$', '', s)  # drop trailing carbonyl anchor
+    s = re.sub(r'^\[\*:\s*1\]', '', s)            # drop leading anchor
+    s = re.sub(r'^\(?N\)?', '', s)                # drop leading N
+    return s
+def _internal_from_nterm(nterm: str) -> str:
+    s = nterm.strip()
+    s = re.sub(r'^\[\*:\s*1\]', '', s)            # drop leading anchor
+    s = re.sub(r'^\(?N\)?', '', s)                # drop leading N
+    s = re.sub(r'C\(=O\)O\s*$', '', s)            # drop trailing COOH
+    return s
+def _chirality_agnostic_regex(literal_smiles: str) -> re.Pattern:
+    """
+    Make a regex that matches the literal SMILES but ignores stereo/ring digit specifics.
+    - Escapes all chars
+    - Makes '@' optional (so [C@@H] / [C@H] / [CH] all match)
+    - Allows any ring digit where a digit appears
+    """
+    esc = re.escape(literal_smiles)
+    # make any '@' optional (two steps to handle @@)
+    esc = esc.replace(r'\@\@', r'\@?\@?')
+    esc = esc.replace(r'\@', r'\@?')
+    # allow any ring digit(s) where digits appear
+    esc = re.sub(r'\\\d+', r'\\d+', esc)
+    return re.compile(esc)
 class PeptideAnalyzer:
     def __init__(self):
         self._build_swisssidechain_lookups()
     def _build_swisssidechain_lookups(self):
         """Populate the SMILES lookup tables from every entry of ``all_aminos``.

         Tables (re)created on ``self``:
           exact_smiles_lookup   -- SMILES string -> residue code
           clean_smiles_lookup   -- stereo-stripped SMILES -> list of codes
           uaa_internal_exact    -- residue code -> internal fragment
           uaa_internal_patterns -- list of (compiled regex, code) pairs

         ``self.three_to_one`` is extended with each entry that has a "Letter".
         """
         self.exact_smiles_lookup = {}
         self.clean_smiles_lookup = {}
         self.uaa_internal_exact = {}
         self.uaa_internal_patterns = []

         def register(smiles_key, residue_code):
             # Record a SMILES string both verbatim and stereo-stripped.
             self.exact_smiles_lookup[smiles_key] = residue_code
             stripped = self._remove_stereochemistry(smiles_key)
             self.clean_smiles_lookup.setdefault(stripped, []).append(residue_code)

         for entry in all_aminos.values():
             code = entry["Code"]
             full_smiles = entry.get("SMILES", "")
             n_template = entry.get("nterm", "")
             c_template = entry.get("cterm", "")
             one_letter = entry.get("Letter")

             # Full amino-acid SMILES lookups.
             if full_smiles:
                 register(full_smiles, code)

             # Internal fragment: the cterm template wins over the nterm one.
             if c_template:
                 fragment = _internal_from_cterm(c_template)
             elif n_template:
                 fragment = _internal_from_nterm(n_template)
             else:
                 fragment = ""

             if fragment:
                 register(fragment, code)
                 self.uaa_internal_exact[code] = fragment
                 self.uaa_internal_patterns.append(
                     (_chirality_agnostic_regex(fragment), code)
                 )

             if one_letter:
                 self.three_to_one[code] = one_letter
     def _remove_stereochemistry(self, smiles):
         """Remove stereochemistry from SMILES"""
         return cleaned
     def preprocess_complex_residues(self, smiles):
         """Replace complex-residue substrings of *smiles* with placeholders.

         Matches are collected first from ``self.complex_residue_patterns``
         and then from ``self.uaa_internal_patterns`` (if that attribute
         exists). A match is kept only if it does not collide with a span that
         was claimed earlier, so earlier patterns take precedence.

         Args:
             smiles: SMILES string to scan.

         Returns:
             ``(preprocessed_smiles, protected_residues)`` where each kept
             match has been replaced by ``COMPLEX_RESIDUE_<n>`` and
             ``protected_residues`` lists, in left-to-right order, dicts with
             the keys ``placeholder``, ``type`` and ``content``. When nothing
             matches, ``(smiles, [])`` is returned.
         """
         claimed = []

         def collides(start, end):
             # A candidate collides when it starts inside a claimed span, ends
             # inside one, or fully encloses one. The enclosing case must be
             # rejected too: otherwise both spans are replaced and the offset
             # arithmetic below slices into an already inserted placeholder.
             return any(
                 span['start'] <= start < span['end']
                 or span['start'] < end <= span['end']
                 or (start <= span['start'] and span['end'] <= end)
                 for span in claimed
             )

         def claim(matches, residue_type):
             for match in matches:
                 if not collides(match.start(), match.end()):
                     claimed.append({
                         'start': match.start(),
                         'end': match.end(),
                         'type': residue_type,
                         'pattern': match.group()
                     })

         for pattern, residue_type in self.complex_residue_patterns:
             claim(re.finditer(pattern, smiles), residue_type)
         # Dynamically built UAA patterns; 'type' is the residue code, e.g. 'Dtg'.
         for rgx, code in getattr(self, 'uaa_internal_patterns', []):
             claim(rgx.finditer(smiles), code)

         if not claimed:
             return smiles, []
         claimed.sort(key=lambda span: span['start'])

         preprocessed_smiles = smiles
         offset = 0  # length change accumulated by earlier replacements
         protected_residues = []
         for span in claimed:
             start = span['start'] + offset
             end = span['end'] + offset
             complex_part = preprocessed_smiles[start:end]
             # No stereo-centre filter is applied here: fragments without an
             # explicit [C@H]/[C@@H] are protected as well.
             placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
             preprocessed_smiles = (
                 preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
             )
             offset += len(placeholder) - (end - start)
             protected_residues.append({
                 'placeholder': placeholder,
                 'type': span['type'],
                 'content': complex_part
             })
         return preprocessed_smiles, protected_residues
     def split_on_bonds(self, smiles, protected_residues=None):
         """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
         positions = []
     def identify_residue(self, segment):
         if 'complex_type' in segment:
             return segment['complex_type'], []
+        # If this was protected by dynamic UAA shielding
+        if segment.get('complex_type') in self.uaa_internal_exact:
+            return segment['complex_type'], []
         content = self.clean_terminal_carboxyl(segment)
         mods = self.get_modifications(segment)
         return sio.getvalue().encode('utf-8')
class PeptideEncoder:
    """Encode a peptide sequence as a SMILES string.

    Residues may be given as one-letter codes (``'AGF'``), as dash-separated
    one- or three-letter codes (``'Ala-Gly-Phe'``), or as a list of tokens.
    Lower-case codes select the D-form tables. A ``(N-Me)`` suffix on a token
    selects the N-methylated bond token.
    """

    # One-letter code -> three-letter code (upper-case: L-form, lower-case: D-form).
    one_to_three = {
        'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 'E': 'Glu', 'F': 'Phe', 'G': 'Gly',
        'H': 'His', 'I': 'Ile', 'K': 'Lys', 'L': 'Leu', 'M': 'Met', 'N': 'Asn',
        'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 'S': 'Ser', 'T': 'Thr', 'V': 'Val',
        'W': 'Trp', 'Y': 'Tyr',
        'a': 'ala', 'c': 'cys', 'd': 'asp', 'e': 'glu', 'f': 'phe', 'g': 'gly',
        'h': 'his', 'i': 'ile', 'k': 'lys', 'l': 'leu', 'm': 'met', 'n': 'asn',
        'p': 'pro', 'q': 'gln', 'r': 'arg', 's': 'ser', 't': 'thr', 'v': 'val',
        'w': 'trp', 'y': 'tyr',
    }

    # Per-residue segments; L-forms use [C@@H] on the alpha carbon.
    SEG_L = {
        'Ala': '[C@@H](C)',
        'Gly': 'C',
        'Val': '[C@@H](C(C)C)',
        'Leu': '[C@@H](CC(C)C)',
        'Ile': '[C@@H]([C@H](C)CC)',
        'Ser': '[C@@H](CO)',
        'Thr': '[C@@H]([C@@H](C)O)',
        'Cys': '[C@@H](CS)',
        'Met': '[C@@H](CCSC)',
        'Phe': '[C@@H](Cc1ccccc1)',
        'Tyr': '[C@@H](Cc1ccc(O)cc1)',
        'Trp': '[C@@H](Cc1c[nH]c2ccccc12)',
        'His': '[C@@H](Cc1c[nH]cn1)',
        'Asp': '[C@@H](CC(=O)O)',
        'Glu': '[C@@H](CCC(=O)O)',
        'Asn': '[C@@H](CC(=O)N)',
        'Gln': '[C@@H](CCC(=O)N)',
        'Lys': '[C@@H](CCCCN)',
        'Arg': '[C@@H](CCCNC(=N)N)',
        # Only used when encode() is called with use_proline_ring=False or
        # Pro is the first residue; otherwise encode() substitutes a ring form.
        'Pro': 'CC[C@H]2CN2',
    }

    # D-forms: keys lower-cased, every [C@@H] rewritten to [C@H], and the
    # proline ring atom '[C@H]2' rewritten to '[C@@H]2'.
    # NOTE(review): a [C@H] that already exists inside a side chain (the Ile
    # beta carbon) is left as-is by this text substitution -- confirm that is
    # the intended D-Ile stereochemistry.
    SEG_D = {
        k.lower(): v.replace('[C@@H]', '[C@H]').replace('[C@H]2', '[C@@H]2')
        for k, v in SEG_L.items()
    }

    # Hand-written segments for a few non-canonical residues.
    UAA_SEG = {
        'Aib': 'C(C)(C)',             # alpha,alpha-dimethyl glycine
        'Nle': '[C@@H](CCCC)',        # norleucine
        'Hph': '[C@@H](CCc1ccccc1)',  # homophenylalanine
        'Cyl': 'C1(CCCC1)',           # cycloleucine
    }

    def __init__(self):
        """Build ``ssc_code_to_internal`` (residue code -> internal fragment).

        Entries come from ``specific_aminos`` first and ``all_aminos`` second,
        so on a duplicate code the ``all_aminos`` fragment wins. The fragment
        is derived from the "cterm" template when present, else from "nterm".
        """
        self.ssc_code_to_internal = {}
        for table in (specific_aminos, all_aminos):
            for data in table.values():
                code = data["Code"]
                cterm = data.get("cterm", "")
                nterm = data.get("nterm", "")
                if cterm:
                    internal = _internal_from_cterm(cterm)
                elif nterm:
                    internal = _internal_from_nterm(nterm)
                else:
                    continue
                if internal:
                    self.ssc_code_to_internal[code] = internal

    def _segment_for(self, code):
        """Return the SMILES segment for residue *code*.

        Lookup order: ``SEG_L``, ``SEG_D``, ``UAA_SEG``, the table-derived
        fragments, and finally ``SEG_L`` again with *code* capitalised.

        Raises:
            ValueError: if *code* is unknown to every table.
        """
        for table in (self.SEG_L, self.SEG_D, self.UAA_SEG, self.ssc_code_to_internal):
            if code in table:
                return table[code]
        cap = code[:1].upper() + code[1:].lower()
        if cap in self.SEG_L:
            return self.SEG_L[cap]
        raise ValueError(f"Unknown residue code: {code}")

    def _is_one_letter_seq(self, seq: str) -> bool:
        """Return True when *seq* contains no '-' separator (one code per character)."""
        return "-" not in seq

    def _norm_token(self, tok):
        """Normalise one token to ``(code, n_me_flag)``.

        One-letter codes are mapped through ``one_to_three``. Other tokens are
        parsed as ``Name`` or ``Name(mods)``; the flag is True when ``mods``
        contains 'N-Me', 'Nme' or 'NME'. A token that cannot be parsed is
        returned unchanged with the flag False.
        """
        n_me = False
        tok = tok.strip()
        if tok in self.one_to_three:
            base = self.one_to_three[tok]
        else:
            m = re.match(r'^([A-Za-z\-]+)(\((.*?)\))?$', tok)
            if not m:
                return tok, n_me
            base = m.group(1)
            mods = m.group(3) or ""
            if 'N-Me' in mods or 'Nme' in mods or 'NME' in mods:
                n_me = True
        return base, n_me

    def _bond_for(self, n_me=False, pro_ring=False, ring_idx=1):
        """Return the inter-residue bond token.

        ``pro_ring`` takes precedence and opens ring ``ring_idx`` on the
        nitrogen; otherwise the N-methylated or plain amide token is returned.
        """
        if pro_ring:
            return f'C(=O)N{ring_idx}'
        return 'N(C)C(=O)' if n_me else 'NC(=O)'

    def _split_tokens(self, seq):
        """Split *seq* into residue tokens.

        Lists/tuples are copied as-is. A string without '-' is split into
        single characters; otherwise it is split on every '-' that is not
        inside parentheses (so 'Ala(N-Me)' stays one token).
        """
        if isinstance(seq, (list, tuple)):
            return list(seq)
        seq = seq.strip()
        if self._is_one_letter_seq(seq):
            return list(seq)
        return [t for t in re.split(r'-(?![^()]*\))', seq) if t]

    def encode(self, seq, cyclic=False, use_proline_ring=True):
        """Encode a peptide as a SMILES string.

        Args:
            seq: list of tokens or a string such as 'Ala-Gly-Phe', 'A-G-F',
                'AGF', 'Ala(N-Me)-Leu-Ser' or 'Aib-Nle-Arg'; D-forms are
                written lower-case ('ala-gly', 'a-g').
            cyclic: when False, the terminal 'C(=O)O' is appended. When True,
                nothing is appended -- ring closure is not implemented yet, so
                the result is the open chain without a terminal carboxyl.
            use_proline_ring: when True, a bond leading into Pro/pro uses the
                ring-opening token and the Pro segment is replaced by a form
                that closes ring digit 1.

        Returns:
            The assembled SMILES string.

        Raises:
            ValueError: if a residue code is unknown.
        """
        res, mods = [], []
        for tok in self._split_tokens(seq):
            base, n_me = self._norm_token(tok)
            res.append(base)
            mods.append(n_me)

        segs = [self._segment_for(r) for r in res]

        # One bond per adjacent residue pair. The ring form is only used for a
        # bond *into* Pro, so a leading Pro keeps its table segment.
        bonds = []
        for i in range(len(segs) - 1):
            nxt = res[i + 1]
            if use_proline_ring and nxt in ('Pro', 'pro'):
                bonds.append(self._bond_for(n_me=mods[i], pro_ring=True, ring_idx=1))
                # Make the Pro segment end with the matching ring digit.
                segs[i + 1] = 'CCC[C@H]1' if nxt == 'Pro' else 'CCC[C@@H]1'
            else:
                bonds.append(self._bond_for(n_me=mods[i], pro_ring=False))

        # segment0 + bond0 + segment1 + ... + segmentN-1 (+ C(=O)O)
        out = []
        for i, seg in enumerate(segs):
            out.append(seg)
            if i < len(bonds):
                out.append(bonds[i])
        if not cyclic:
            out.append('C(=O)O')
        # TODO: cyclic=True should connect the C-terminus back to the N-terminus.
        return ''.join(out)
 def process_input(
     smiles_input=None,
     file_obj=None,
             #structure_files if structure_files else []
         )
+def process_sequence_to_smiles(
+    seq_input: str,
+    show_segment_details: bool = False,
+    use_proline_ring: bool = True,
+    cyclic: bool = False
+):
+    """
+    Encode a peptide sequence to SMILES, then analyze back with PeptideAnalyzer for round-trip.
+    """
+    if not seq_input or not seq_input.strip():
+        return "Please enter a peptide sequence.", None, None
+    try:
+        enc = PeptideEncoder()  # make sure this class is defined in your file
+        smiles = enc.encode(seq_input.strip(), cyclic=cyclic, use_proline_ring=use_proline_ring)
+        analyzer = PeptideAnalyzer()
+        # pre-check it's a peptide
+        if not analyzer.is_peptide(smiles):
+            return "Internal error: generated SMILES did not look like a peptide.", None, None
+        # analyze round-trip
+        analysis = analyzer.analyze_structure(smiles, verbose=show_segment_details)
+        three_letter = analysis['three_letter']
+        one_letter = analysis['one_letter']
+        is_cyclic = analysis['is_cyclic']
+        details = analysis.get('details', "")
+        img = annotate_cyclic_structure(Chem.MolFromSmiles(smiles), three_letter)
+        summary = []
+        summary.append("Peptide → SMILES")
+        summary.append("-" * 50)
+        summary.append(f"Input sequence: {seq_input}")
+        summary.append(f"Generated SMILES:\n{smiles}")
+        summary.append("")
+        summary.append("Round-trip check (SMILES → sequence):")
+        summary.append(f"Sequence: {three_letter}")
+        summary.append(f"One-letter code: {one_letter}")
+        summary.append(f"Is Cyclic: {'Yes' if is_cyclic else 'No'}")
+        if show_segment_details and details:
+            summary.append("\n" + "="*50)
+            summary.append("SEGMENT ANALYSIS")
+            summary.append("="*50)
+            summary.append(details)
+        # UAA report
+        detected_uaas = [aa for aa in analysis['residues'] if aa not in [
+            'Ala', 'Cys', 'Asp', 'Glu', 'Phe', 'Gly', 'His', 'Ile', 'Lys', 'Leu',
+            'Met', 'Asn', 'Pro', 'Gln', 'Arg', 'Ser', 'Thr', 'Val', 'Trp', 'Tyr',
+            'ala', 'cys', 'asp', 'glu', 'phe', 'gly', 'his', 'ile', 'lys', 'leu',
+            'met', 'asn', 'pro', 'gln', 'arg', 'ser', 'thr', 'val', 'trp', 'tyr'
+        ]]
+        if detected_uaas:
+            summary.append(f"\nDetected UAAs (round-trip): {', '.join(sorted(set(detected_uaas)))}")
+        return "\n".join(summary), img, smiles
+    except Exception as e:
+        return f"Error: {str(e)}", None, None
+# Two-tab Gradio UI: SMILES -> sequence analysis, and sequence -> SMILES encoding.
+# Components are created in the order written below, so statement order matters here.
+with gr.Blocks(title="Peptide Structure Analyzer and Visualizer") as demo:
+    gr.Markdown("# Peptide Structure Analyzer and Visualizer")
+    # Description shown above both tabs, with copy-paste examples for each direction.
+    gr.Markdown("""
+    Analyze and visualize peptide structures from SMILES notation:
+    1. Validates if the input is a peptide structure
+    2. Determines if the peptide is cyclic
+    3. Parses the amino acid sequence
+    4. Creates 2D structure visualization with residue annotations
+    Input: Either enter a SMILES string directly or upload a text file containing SMILES strings
+    Example SMILES strings (copy and paste):
+    ```
+    CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@@H](C)N(C)C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H]2CCCN2C1=O
+    ```
+    ```
+    C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O
+    ```
+    ```
+    CC(C)C[C@H]1C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)NCC(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N(C)CC(=O)N[C@@H]([C@@H](C)O)C(=O)N(C)[C@@H](C)C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N1C
+    ```
+    Example Peptide strings (copy and paste):
+    ```
+    AGFS
+    ```
+    ```
+    Ala-Gly-Phe-Ser
+    ```
+    ```
+    Aib-Dtg-Ser
+    ```
+    """)
+    # Tab 1: analyze an existing SMILES string (typed in or uploaded as .txt).
+    with gr.Tab("SMILES → Sequence"):
+        gr.Markdown("Analyze peptide SMILES, detect cyclicity, parse sequence, and annotate.")
+        smiles_in = gr.Textbox(label="Enter SMILES string", lines=2, placeholder="Enter SMILES notation of peptide...")
+        file_in   = gr.File(label="Or upload a text file with SMILES", file_types=[".txt"])
+        show_seg  = gr.Checkbox(label="Show segmentation details", value=False)
+        run_btn_1 = gr.Button("Analyze")
+        out_text_1 = gr.Textbox(label="Analysis Results", lines=12)
+        out_img_1  = gr.Image(label="2D Structure with Annotations", type="pil")
+        out_md_1   = gr.Markdown(label="Side Notes for Non-Standard Amino Acids")
+        # Adapter around process_input: this tab exposes only three of its
+        # options and always passes generate_3d=False and use_uff=False.
+        def _run_smiles(s_in, f_in, sh):
+            return process_input(
+                smiles_input=s_in,
+                file_obj=f_in,
+                show_segment_details=sh,
+                generate_3d=False,
+                use_uff=False
+            )
+        run_btn_1.click(
+            _run_smiles,
+            inputs=[smiles_in, file_in, show_seg],
+            outputs=[out_text_1, out_img_1, out_md_1]
+        )
+    # Tab 2: encode a sequence to SMILES and show the round-trip analysis.
+    with gr.Tab("Peptide → SMILES"):
+        gr.Markdown("Encode a peptide sequence to SMILES (one-letter or three-letter) and verify round-trip.")
+        seq_in = gr.Textbox(
+            label="Enter peptide sequence",
+            lines=2,
+            placeholder="Examples: AGFS  |  Ala-Gly-Phe-Ser  |  Ala(N-Me)-Pro-Phe  |  Aib-Dtg-Ser"
+        )
+        with gr.Row():
+            use_pro = gr.Checkbox(label="Use Proline ring join", value=True)
+            cyc     = gr.Checkbox(label="Cyclic (macrocycle)", value=False)
+        show_seg2 = gr.Checkbox(label="Show segmentation details", value=False)
+        run_btn_2 = gr.Button("Encode")
+        out_text_2 = gr.Textbox(label="Results & Round-trip", lines=14)
+        out_img_2  = gr.Image(label="2D Structure with Annotations", type="pil")
+        out_smiles = gr.Textbox(label="Generated SMILES (copyable)", lines=2)
+        # The inputs list is positional: it must follow the parameter order of
+        # process_sequence_to_smiles (seq_input, show_segment_details,
+        # use_proline_ring, cyclic).
+        run_btn_2.click(
+            process_sequence_to_smiles,
+            inputs=[seq_in, show_seg2, use_pro, cyc],
+            outputs=[out_text_2, out_img_2, out_smiles]
+        )
+# Start the app only when this file is run as a script.
+# NOTE(review): share=True asks Gradio for a public share link; confirm this is
+# wanted for the hosting environment.
+if __name__ == "__main__":
+    demo.launch(share=True)
+"""
 iface = gr.Interface(
     fn=process_input,
     inputs=[
 if __name__ == "__main__":
     iface.launch(share=True)
+"""
 """
 5. Optional linear representation
 6. Optional 3D structure generation (ETKDG and UFF methods)