Spaces:
Running
Running
Commit
·
a953180
1
Parent(s):
418afab
class format
Browse files
app.py
CHANGED
|
@@ -11,257 +11,294 @@ import matplotlib.pyplot as plt
|
|
| 11 |
import matplotlib.patches as patches
|
| 12 |
from io import BytesIO
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
return True
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
def remove_nested_branches(smiles):
    """Return *smiles* with every parenthesised branch removed.

    Everything between a '(' and its matching ')' is dropped, including
    the parentheses themselves and any branches nested inside; only the
    characters found at parenthesis depth zero are kept.

    Args:
        smiles: SMILES text to strip.

    Returns:
        The depth-zero characters of *smiles*, in their original order.
    """
    kept = []
    depth = 0
    for char in smiles:
        if char == '(':
            depth += 1
        elif char == ')':
            # Clamp at zero: a stray ')' must not push the depth negative,
            # which would silently discard the rest of the string.
            depth = max(depth - 1, 0)
        elif depth == 0:
            kept.append(char)
    # Join once instead of growing a string with += inside the loop.
    return ''.join(kept)
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
modifications = []
|
| 68 |
-
# Check for N-methylation
|
| 69 |
-
if 'N(C)' in segment: # Changed to look in current segment
|
| 70 |
-
modifications.append('N-Me')
|
| 71 |
-
if next_segment and 'OC(=O)' in next_segment:
|
| 72 |
-
modifications.append('O-linked')
|
| 73 |
-
|
| 74 |
-
# Check for Proline - but not if it's actually Cha
|
| 75 |
-
if any(pattern in segment for pattern in ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']):
|
| 76 |
-
if not 'CCCCC' in segment: # Make sure it's not Cha
|
| 77 |
-
return ('Pro', modifications)
|
| 78 |
-
|
| 79 |
-
# Check if this segment is part of a Proline ring by looking at context
|
| 80 |
-
if prev_segment and next_segment:
|
| 81 |
-
if ('CCC' in segment and 'N' in next_segment) or ('N' in segment and 'CCC' in prev_segment):
|
| 82 |
-
combined = prev_segment + segment + next_segment
|
| 83 |
-
if re.search(r'CCCN.*C\(=O\)', combined) and not 'CCCCC' in combined:
|
| 84 |
-
return ('Pro', modifications)
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
return ('Trp', modifications)
|
| 101 |
-
if 'c1cnc[nH]1' in segment:
|
| 102 |
-
return ('His', modifications)
|
| 103 |
-
|
| 104 |
-
# Branched chain amino acids
|
| 105 |
-
if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
|
| 106 |
-
return ('Leu', modifications)
|
| 107 |
-
if '[C@H](CC(C)C)' in segment or '[C@@H](CC(C)C)' in segment:
|
| 108 |
-
return ('Leu', modifications)
|
| 109 |
-
if 'C(C)C' in segment and not any(pat in segment for pat in ['CC(C)C', 'C(C)C[C@H]', 'C(C)C[C@@H]']):
|
| 110 |
-
return ('Val', modifications)
|
| 111 |
-
if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
|
| 112 |
-
return ('Ile', modifications)
|
| 113 |
-
|
| 114 |
-
# Small/polar amino acids - make Ala check more specific
|
| 115 |
-
if '[C@H](CO)' in segment:
|
| 116 |
-
return ('Ser', modifications)
|
| 117 |
-
if '[C@@H]([C@@H](C)O)' in segment or '[C@H]([C@H](C)O)' in segment:
|
| 118 |
-
return ('Thr', modifications)
|
| 119 |
-
if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
|
| 120 |
-
return ('Gly', modifications)
|
| 121 |
-
if ('[C@@H](C)' in segment or '[C@H](C)' in segment) and \
|
| 122 |
-
not any(pat in segment for pat in ['O', 'CC(C)', 'COC']):
|
| 123 |
-
return ('Ala', modifications)
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
-
def
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
if
|
| 161 |
-
|
| 162 |
-
sequence.append(f"{residue}({','.join(mods)})")
|
| 163 |
-
else:
|
| 164 |
-
sequence.append(residue)
|
| 165 |
-
|
| 166 |
-
print("\nDetailed Analysis:")
|
| 167 |
-
print("Segments:", segments)
|
| 168 |
-
print("Found sequence:", sequence)
|
| 169 |
-
|
| 170 |
-
if is_cyclic_peptide(smiles):
|
| 171 |
-
return f"cyclo({'-'.join(sequence)})"
|
| 172 |
-
return '-'.join(sequence)
|
| 173 |
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
cycle_info = {}
|
| 182 |
-
|
| 183 |
-
# Find all cycle numbers and their contexts
|
| 184 |
-
for match in re.finditer(r'(\d)', smiles):
|
| 185 |
-
number = match.group(1)
|
| 186 |
-
position = match.start(1)
|
| 187 |
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
print("\nCycle Analysis:")
|
| 197 |
-
for num, occurrences in cycle_info.items():
|
| 198 |
-
print(f"Cycle number {num}:")
|
| 199 |
-
for occ in occurrences:
|
| 200 |
-
print(f"Position: {occ['position']}")
|
| 201 |
-
print(f"Context: {occ['full_context']}")
|
| 202 |
-
|
| 203 |
-
# Check each cycle
|
| 204 |
-
peptide_cycles = []
|
| 205 |
-
aromatic_cycles = []
|
| 206 |
-
|
| 207 |
-
for number, occurrences in cycle_info.items():
|
| 208 |
-
if len(occurrences) != 2:
|
| 209 |
-
continue
|
| 210 |
-
|
| 211 |
-
start, end = occurrences[0]['position'], occurrences[1]['position']
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
is_aromatic = ('c2ccccc2' in full_context and len(segment) < 20) or ('c1ccccc1' in full_context and len(segment) < 20)
|
| 219 |
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
'C
|
| 225 |
-
|
| 226 |
-
]
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
|
| 231 |
-
if
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
print("\nFound cycles:")
|
| 237 |
-
print(f"Peptide cycles: {peptide_cycles}")
|
| 238 |
-
print(f"Aromatic cycles: {aromatic_cycles}")
|
| 239 |
-
|
| 240 |
-
return len(peptide_cycles) > 0
|
| 241 |
|
| 242 |
-
def
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
-
|
| 258 |
-
return
|
| 259 |
-
|
| 260 |
-
'Sequence': f'Error: {str(e)}',
|
| 261 |
-
'Is Cyclic': 'Error',
|
| 262 |
-
#'Peptide Cycles': 'Error',
|
| 263 |
-
#'Aromatic Cycles': 'Error'
|
| 264 |
-
}
|
| 265 |
"""
|
| 266 |
def annotate_cyclic_structure(mol, sequence):
|
| 267 |
'''Create annotated 2D structure with clear, non-overlapping residue labels'''
|
|
@@ -529,16 +566,15 @@ def create_enhanced_linear_viz(sequence, smiles):
|
|
| 529 |
return fig
|
| 530 |
|
| 531 |
def process_input(smiles_input=None, file_obj=None, show_linear=False):
|
| 532 |
-
"""Process input and create visualizations"""
|
| 533 |
-
|
| 534 |
-
images = []
|
| 535 |
|
| 536 |
# Handle direct SMILES input
|
| 537 |
if smiles_input:
|
| 538 |
smiles = smiles_input.strip()
|
| 539 |
|
| 540 |
-
# First check if it's a peptide
|
| 541 |
-
if not is_peptide(smiles):
|
| 542 |
return "Error: Input SMILES does not appear to be a peptide structure.", None, None
|
| 543 |
|
| 544 |
try:
|
|
@@ -547,9 +583,32 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False):
|
|
| 547 |
if mol is None:
|
| 548 |
return "Error: Invalid SMILES notation.", None, None
|
| 549 |
|
| 550 |
-
#
|
| 551 |
-
|
| 552 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
|
| 554 |
# Create cyclic structure visualization
|
| 555 |
img_cyclic = annotate_cyclic_structure(mol, sequence)
|
|
@@ -558,19 +617,21 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False):
|
|
| 558 |
img_linear = None
|
| 559 |
if show_linear:
|
| 560 |
fig_linear = create_enhanced_linear_viz(sequence, smiles)
|
| 561 |
-
|
| 562 |
-
# Convert matplotlib figure to image
|
| 563 |
buf = BytesIO()
|
| 564 |
fig_linear.savefig(buf, format='png', bbox_inches='tight', dpi=300)
|
| 565 |
buf.seek(0)
|
| 566 |
img_linear = Image.open(buf)
|
| 567 |
plt.close(fig_linear)
|
| 568 |
|
| 569 |
-
#
|
| 570 |
-
|
| 571 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
|
| 573 |
-
return output_text, img_cyclic, img_linear
|
| 574 |
|
| 575 |
except Exception as e:
|
| 576 |
return f"Error processing SMILES: {str(e)}", None, None
|
|
@@ -578,31 +639,51 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False):
|
|
| 578 |
# Handle file input
|
| 579 |
if file_obj is not None:
|
| 580 |
try:
|
| 581 |
-
# Handle file content
|
| 582 |
-
if hasattr(file_obj, 'name'):
|
| 583 |
with open(file_obj.name, 'r') as f:
|
| 584 |
content = f.read()
|
| 585 |
-
else:
|
| 586 |
content = file_obj.decode('utf-8') if isinstance(file_obj, bytes) else str(file_obj)
|
| 587 |
|
| 588 |
output_text = ""
|
| 589 |
for line in content.splitlines():
|
| 590 |
smiles = line.strip()
|
| 591 |
if smiles:
|
| 592 |
-
if
|
|
|
|
| 593 |
output_text += f"Skipping non-peptide SMILES: {smiles}\n"
|
| 594 |
continue
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 598 |
output_text += "-" * 50 + "\n"
|
|
|
|
| 599 |
return output_text, None, None
|
| 600 |
|
| 601 |
except Exception as e:
|
| 602 |
return f"Error processing file: {str(e)}", None, None
|
| 603 |
|
| 604 |
return "No input provided.", None, None
|
| 605 |
-
|
| 606 |
# Create Gradio interface with simplified examples
|
| 607 |
iface = gr.Interface(
|
| 608 |
fn=process_input,
|
|
|
|
| 11 |
import matplotlib.patches as patches
|
| 12 |
from io import BytesIO
|
| 13 |
|
| 14 |
+
import re
|
| 15 |
+
from rdkit import Chem
|
| 16 |
+
|
| 17 |
+
class PeptideAnalyzer:
    """Heuristic, mostly string-based analyser for peptide SMILES.

    The SMILES text is cut at substrings that look like backbone bonds
    (ester, amide, N-methyl amide); each remaining piece is then matched
    against literal side-chain patterns to guess a residue name.  Only
    `is_peptide` needs RDKit; every other method works on the raw text.
    """

    # The 'NCC(=O)' unit is claimed before any generic bond pattern and is
    # later reported as a segment of its own (identified as Gly).
    _GLY_PATTERN = re.compile(r'NCC\(=O\)')

    # Tried in this order; a match is accepted only when none of its
    # characters were already claimed by an earlier match.
    _SPLIT_PATTERNS = (
        (re.compile(r'OC\(=O\)'), 'ester'),
        (re.compile(r'N\(C\)C\(=O\)'), 'n_methyl'),
        (re.compile(r'N[12]C\(=O\)'), 'peptide'),  # Pro peptide bonds
        (re.compile(r'NC\(=O\)'), 'peptide'),  # Regular peptide bonds
        (re.compile(r'C\(=O\)N\(C\)'), 'n_methyl'),
        (re.compile(r'C\(=O\)N[12]?'), 'peptide'),
    )

    # Substrings that mark a ring segment as containing a backbone amide.
    _CYCLE_PEPTIDE_PATTERNS = (
        'C(=O)N',  # Regular peptide bond
        'C(=O)N(C)',  # N-methylated peptide bond
        'C(=O)N1',  # Cyclic peptide bond
        'C(=O)N2',  # Cyclic peptide bond
    )

    def __init__(self):
        # Regex sources for the recognised backbone bonds.  Kept as a public
        # attribute; the methods of this class use _SPLIT_PATTERNS instead.
        self.bond_patterns = [
            r'OC\(=O\)',  # ester bond
            r'N\(C\)C\(=O\)',  # N-methylated peptide bond
            r'N[12]?C\(=O\)',  # peptide bond (including Pro N1/N2)
            r'C\(=O\)N\(C\)',  # N-methylated peptide bond reverse
            r'C\(=O\)N'  # peptide bond reverse
        ]

    def is_peptide(self, smiles):
        """Check whether *smiles* parses and contains a peptide-like bond.

        Args:
            smiles: SMILES string to test.

        Returns:
            True if RDKit parses the string and it contains an amide N-H,
            an N-substituted amide or an ester substructure; False otherwise
            (including when the string cannot be parsed).
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return False

        smarts_checks = (
            '[NH][C](=O)',  # peptide bonds: NC(=O) pattern
            '[N;H0;$(NC)](C)[C](=O)',  # N-methylated peptide bonds: N(C)C(=O)
            # NOTE(review): presumably also matches free carboxylic acids,
            # not only depsipeptide esters - confirm this is intended.
            'O[C](=O)',  # ester bonds in cyclic depsipeptides: OC(=O)
        )
        return any(
            mol.HasSubstructMatch(Chem.MolFromSmarts(smarts))
            for smarts in smarts_checks
        )

    @staticmethod
    def _ring_closure_pairs(smiles):
        """Pair up ring-closure labels found in *smiles*.

        Digits inside bracket atoms (isotopes, hydrogen counts, charges) are
        not ring closures and are ignored.  A label may be reused once its
        ring has been closed, so labels are paired open/close in reading
        order.  Two-digit '%nn' labels are read as a single label.

        Returns:
            A list of (label, open_pos, close_pos) tuples sorted by opening
            position.  close_pos is the index of the last character of the
            closing label.  Labels that are never closed are dropped.
        """
        open_at = {}
        pairs = []
        in_bracket = False
        length = len(smiles)
        i = 0
        while i < length:
            ch = smiles[i]
            label = None
            last = i
            if ch == '[':
                in_bracket = True
            elif ch == ']':
                in_bracket = False
            elif not in_bracket:
                if ch == '%' and i + 2 < length and smiles[i + 1:i + 3].isdecimal():
                    label = smiles[i + 1:i + 3]
                    last = i + 2
                elif ch.isdecimal():
                    label = ch
            if label is not None:
                if label in open_at:
                    pairs.append((label, open_at.pop(label), last))
                else:
                    open_at[label] = i
            i = last + 1
        pairs.sort(key=lambda pair: pair[1])
        return pairs

    def is_cyclic(self, smiles):
        """
        Determine if SMILES represents a cyclic peptide.

        Each ring is classified from the text between its two closure
        labels: a ring shorter than 20 characters with a benzene pattern
        nearby is counted as aromatic; otherwise a ring longer than 20
        characters that contains an amide pattern is counted as a peptide
        macrocycle.

        Returns: (is_cyclic, peptide_cycles, aromatic_cycles), where the
        two lists hold the ring labels as strings.  A label appears once
        per ring, so a reused label can appear more than once.
        """
        peptide_cycles = []
        aromatic_cycles = []

        for label, start, end in self._ring_closure_pairs(smiles):
            segment = smiles[start:end + 1]

            # Benzene pattern is searched in a window padded by ten characters
            context = smiles[max(0, start - 10):min(len(smiles), end + 10)]
            is_aromatic = len(segment) < 20 and (
                'c2ccccc2' in context or 'c1ccccc1' in context
            )

            has_peptide_bond = len(segment) > 20 and any(
                pattern in segment for pattern in self._CYCLE_PEPTIDE_PATTERNS
            )

            if is_aromatic:
                aromatic_cycles.append(label)
            elif has_peptide_bond:
                peptide_cycles.append(label)

        return len(peptide_cycles) > 0, peptide_cycles, aromatic_cycles

    def split_on_bonds(self, smiles):
        """Split SMILES into residue segments at backbone-bond substrings.

        Args:
            smiles: SMILES string to split.

        Returns:
            A list of dicts in string order.  Each has 'content' (the text
            of the segment) and, where applicable, 'bond_before' and
            'bond_after' (the matched bond text on either side).  Every
            'NCC(=O)' match is emitted as its own segment.  An empty list
            is returned when no bond pattern matches.
        """
        positions = []
        used = set()

        def claim(match, kind):
            # Accept a match only if it overlaps nothing already claimed.
            span = range(match.start(), match.end())
            if used.isdisjoint(span):
                positions.append({
                    'start': match.start(),
                    'end': match.end(),
                    'type': kind,
                    'pattern': match.group()
                })
                used.update(span)

        # Find Gly pattern first
        for match in self._GLY_PATTERN.finditer(smiles):
            claim(match, 'gly')

        # Then find all bonds, including N2C(=O)
        for regex, bond_type in self._SPLIT_PATTERNS:
            for match in regex.finditer(smiles):
                claim(match, bond_type)

        if not positions:
            return []

        positions.sort(key=lambda x: x['start'])
        segments = []

        # Text before the first bond
        first = positions[0]
        if first['start'] > 0:
            segments.append({
                'content': smiles[0:first['start']],
                'bond_after': first['pattern']
            })

        last_index = len(positions) - 1
        for i, current in enumerate(positions):
            next_pos = positions[i + 1] if i < last_index else None

            if current['type'] == 'gly':
                # The Gly match is a residue in itself.  It is emitted even
                # when it is the last match, so a trailing Gly is not lost.
                gly_segment = {
                    'content': 'NCC(=O)',
                    'bond_before': positions[i - 1]['pattern'] if i > 0 else None
                }
                if next_pos is not None:
                    gly_segment['bond_after'] = next_pos['pattern']
                segments.append(gly_segment)

            if next_pos is None:
                break

            # Text between this match and the next one.  This also runs
            # after a Gly match, so the residue following Gly is kept.
            content = smiles[current['end']:next_pos['start']]
            if content:
                segments.append({
                    'content': content,
                    'bond_before': current['pattern'],
                    'bond_after': next_pos['pattern']
                })

        # Text after the last bond
        final = positions[-1]
        if final['end'] < len(smiles):
            segments.append({
                'content': smiles[final['end']:],
                'bond_before': final['pattern']
            })

        return segments

    def identify_residue(self, segment):
        """Guess the residue name for one segment from `split_on_bonds`.

        The checks run in a fixed order and the first hit wins, so the more
        specific side-chain patterns are tested before the generic ones.

        Args:
            segment: dict with 'content' and optionally 'bond_after'.

        Returns:
            (name, mods): name is the residue label or None when nothing
            matched; mods is the list from `get_modifications`.
        """
        content = segment['content']
        mods = self.get_modifications(segment)

        # Special handling for Pro: the ring is split by the bond cut, so it
        # is recognised from the pieces on either side of the bond.
        # NOTE(review): only ring label 2 is recognised here - confirm.
        pro_context = (
            (segment.get('bond_after') == 'N2C(=O)' and 'CCC' in content)
            or ('CCCN2' in content and content.endswith('=O'))  # End case
        )
        if pro_context and ('[C@@H]2' in content or '[C@H]2' in content):
            return 'Pro', mods

        if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
            return 'Nle', mods

        # Ornithine (Orn) - 3-carbon chain with NH2
        if ('C[C@H](CCCN)' in content or 'C[C@@H](CCCN)' in content) and 'CC(C)' not in content:
            return 'Orn', mods

        # 2-Naphthylalanine (2Nal) - distinct from Phe pattern
        if ('Cc3cc2ccccc2c3' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
            return '2Nal', mods

        # Cyclohexylalanine (Cha)
        if 'N2CCCCC2' in content or 'CCCCC2' in content:
            return 'Cha', mods

        # Aminobutyric acid (Abu) - 2-carbon chain
        if ('C[C@H](CC)' in content or 'C[C@@H](CC)' in content) and \
                not any(p in content for p in ['CC(C)', 'CCCC', 'CCC(C)']):
            return 'Abu', mods

        # Pipecolic acid (Pip) - 6-membered ring like Pro
        if ('N3CCCCC3' in content or 'CCCCC3' in content) and \
                ('C[C@H]' in content or 'C[C@@H]' in content):
            return 'Pip', mods

        # Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
        if 'C[C@H](C1CCCCC1)' in content or 'C[C@@H](C1CCCCC1)' in content:
            return 'Chg', mods

        # 4-Fluorophenylalanine (4F-Phe)
        if ('Cc2ccc(F)cc2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
            return '4F-Phe', mods

        # Regular residue identification
        if 'NCC(=O)' in content:
            return 'Gly', mods

        if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content:
            return 'Leu', mods
        if '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content:
            return 'Leu', mods

        if ('C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content) and 'CC(C)C' not in content:
            return 'Ile', mods

        if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
            return 'Thr', mods

        if '[C@H](Cc2ccccc2)' in content or '[C@@H](Cc2ccccc2)' in content:
            return 'Phe', mods

        if '[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content:
            if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']):
                return 'Val', mods

        if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
            return 'O-tBu', mods

        if '[C@H](C)' in content or '[C@@H](C)' in content:
            if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O']):
                return 'Ala', mods

        return None, mods

    def get_modifications(self, segment):
        """Get modifications implied by the bond that follows *segment*.

        Args:
            segment: dict that may carry a 'bond_after' string.

        Returns:
            A list containing 'N-Me' when the following bond text contains
            'N(C)' and 'O-linked' when it contains 'OC(=O)'; empty when
            there is no following bond.
        """
        mods = []
        bond_after = segment.get('bond_after')
        if bond_after:
            # 'N(C)' also covers the 'C(=O)N(C)' orientation of the bond.
            if 'N(C)' in bond_after:
                mods.append('N-Me')
            if 'OC(=O)' in bond_after:
                mods.append('O-linked')
        return mods

    def analyze_structure(self, smiles):
        """Split *smiles*, name each segment and return the sequence string.

        A trace of every segment is printed to stdout.

        Args:
            smiles: SMILES string to analyse.

        Returns:
            The identified residues joined with '-', wrapped in
            'cyclo(...)' when `is_cyclic` reports a peptide macrocycle.
            Segments that cannot be identified are left out.
        """
        print("\nAnalyzing structure:", smiles)

        # Split into segments
        segments = self.split_on_bonds(smiles)

        print("\nSegment Analysis:")
        sequence = []
        for i, segment in enumerate(segments):
            print(f"\nSegment {i}:")
            print(f"Content: {segment['content']}")
            print(f"Bond before: {segment.get('bond_before', 'None')}")
            print(f"Bond after: {segment.get('bond_after', 'None')}")

            residue, mods = self.identify_residue(segment)
            if not residue:
                print(f"Warning: Could not identify residue in segment: {segment['content']}")
                continue

            if mods:
                sequence.append(f"{residue}({','.join(mods)})")
            else:
                sequence.append(residue)
            print(f"Identified as: {residue}")
            print(f"Modifications: {mods}")

        # Use the same cyclicity test as the rest of the class.  A bare
        # 'N1'/'N2' substring test would also fire on small side rings in
        # linear peptides.
        cyclic, _, _ = self.is_cyclic(smiles)
        joined = '-'.join(sequence)
        final_sequence = f"cyclo({joined})" if cyclic else joined

        print(f"\nFinal sequence: {final_sequence}")
        return final_sequence
|
| 301 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
"""
|
| 303 |
def annotate_cyclic_structure(mol, sequence):
|
| 304 |
'''Create annotated 2D structure with clear, non-overlapping residue labels'''
|
|
|
|
| 566 |
return fig
|
| 567 |
|
| 568 |
def process_input(smiles_input=None, file_obj=None, show_linear=False):
|
| 569 |
+
"""Process input and create visualizations using PeptideAnalyzer"""
|
| 570 |
+
analyzer = PeptideAnalyzer()
|
|
|
|
| 571 |
|
| 572 |
# Handle direct SMILES input
|
| 573 |
if smiles_input:
|
| 574 |
smiles = smiles_input.strip()
|
| 575 |
|
| 576 |
+
# First check if it's a peptide using analyzer's method
|
| 577 |
+
if not analyzer.is_peptide(smiles):
|
| 578 |
return "Error: Input SMILES does not appear to be a peptide structure.", None, None
|
| 579 |
|
| 580 |
try:
|
|
|
|
| 583 |
if mol is None:
|
| 584 |
return "Error: Invalid SMILES notation.", None, None
|
| 585 |
|
| 586 |
+
# Use analyzer to get sequence
|
| 587 |
+
segments = analyzer.split_on_bonds(smiles)
|
| 588 |
+
|
| 589 |
+
# Process segments and build sequence
|
| 590 |
+
sequence_parts = []
|
| 591 |
+
output_text = "Segment Analysis:\n"
|
| 592 |
+
for i, segment in enumerate(segments):
|
| 593 |
+
output_text += f"\nSegment {i}:\n"
|
| 594 |
+
output_text += f"Content: {segment['content']}\n"
|
| 595 |
+
output_text += f"Bond before: {segment.get('bond_before', 'None')}\n"
|
| 596 |
+
output_text += f"Bond after: {segment.get('bond_after', 'None')}\n"
|
| 597 |
+
|
| 598 |
+
residue, mods = analyzer.identify_residue(segment)
|
| 599 |
+
if residue:
|
| 600 |
+
if mods:
|
| 601 |
+
sequence_parts.append(f"{residue}({','.join(mods)})")
|
| 602 |
+
else:
|
| 603 |
+
sequence_parts.append(residue)
|
| 604 |
+
output_text += f"Identified as: {residue}\n"
|
| 605 |
+
output_text += f"Modifications: {mods}\n"
|
| 606 |
+
else:
|
| 607 |
+
output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
|
| 608 |
+
|
| 609 |
+
# Check if cyclic using analyzer's method
|
| 610 |
+
is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
|
| 611 |
+
sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
|
| 612 |
|
| 613 |
# Create cyclic structure visualization
|
| 614 |
img_cyclic = annotate_cyclic_structure(mol, sequence)
|
|
|
|
| 617 |
img_linear = None
|
| 618 |
if show_linear:
|
| 619 |
fig_linear = create_enhanced_linear_viz(sequence, smiles)
|
|
|
|
|
|
|
| 620 |
buf = BytesIO()
|
| 621 |
fig_linear.savefig(buf, format='png', bbox_inches='tight', dpi=300)
|
| 622 |
buf.seek(0)
|
| 623 |
img_linear = Image.open(buf)
|
| 624 |
plt.close(fig_linear)
|
| 625 |
|
| 626 |
+
# Add summary to output
|
| 627 |
+
summary = f"\nSummary:\n"
|
| 628 |
+
summary += f"Sequence: {sequence}\n"
|
| 629 |
+
summary += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
|
| 630 |
+
if is_cyclic:
|
| 631 |
+
summary += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
|
| 632 |
+
summary += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
|
| 633 |
|
| 634 |
+
return summary + "\n" + output_text, img_cyclic, img_linear
|
| 635 |
|
| 636 |
except Exception as e:
|
| 637 |
return f"Error processing SMILES: {str(e)}", None, None
|
|
|
|
| 639 |
# Handle file input
|
| 640 |
if file_obj is not None:
|
| 641 |
try:
|
| 642 |
+
# Handle file content
|
| 643 |
+
if hasattr(file_obj, 'name'):
|
| 644 |
with open(file_obj.name, 'r') as f:
|
| 645 |
content = f.read()
|
| 646 |
+
else:
|
| 647 |
content = file_obj.decode('utf-8') if isinstance(file_obj, bytes) else str(file_obj)
|
| 648 |
|
| 649 |
output_text = ""
|
| 650 |
for line in content.splitlines():
|
| 651 |
smiles = line.strip()
|
| 652 |
if smiles:
|
| 653 |
+
# Check if it's a peptide
|
| 654 |
+
if not analyzer.is_peptide(smiles):
|
| 655 |
output_text += f"Skipping non-peptide SMILES: {smiles}\n"
|
| 656 |
continue
|
| 657 |
+
|
| 658 |
+
# Process this SMILES
|
| 659 |
+
segments = analyzer.split_on_bonds(smiles)
|
| 660 |
+
sequence_parts = []
|
| 661 |
+
for segment in segments:
|
| 662 |
+
residue, mods = analyzer.identify_residue(segment)
|
| 663 |
+
if residue:
|
| 664 |
+
if mods:
|
| 665 |
+
sequence_parts.append(f"{residue}({','.join(mods)})")
|
| 666 |
+
else:
|
| 667 |
+
sequence_parts.append(residue)
|
| 668 |
+
|
| 669 |
+
# Get cyclicity and create sequence
|
| 670 |
+
is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
|
| 671 |
+
sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
|
| 672 |
+
|
| 673 |
+
output_text += f"SMILES: {smiles}\n"
|
| 674 |
+
output_text += f"Sequence: {sequence}\n"
|
| 675 |
+
output_text += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
|
| 676 |
+
if is_cyclic:
|
| 677 |
+
output_text += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
|
| 678 |
+
output_text += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
|
| 679 |
output_text += "-" * 50 + "\n"
|
| 680 |
+
|
| 681 |
return output_text, None, None
|
| 682 |
|
| 683 |
except Exception as e:
|
| 684 |
return f"Error processing file: {str(e)}", None, None
|
| 685 |
|
| 686 |
return "No input provided.", None, None
|
|
|
|
| 687 |
# Create Gradio interface with simplified examples
|
| 688 |
iface = gr.Interface(
|
| 689 |
fn=process_input,
|