clarindasusan committed on
Commit
bfab08d
·
1 Parent(s): 0f3733b

Replacing QML file

Browse files
Files changed (1) hide show
  1. app/utils.py +158 -19
app/utils.py CHANGED
@@ -2,6 +2,7 @@ import numpy as np
2
  from rdkit import Chem
3
  from rdkit.Chem import Descriptors, AllChem
4
  import logging
 
5
 
6
  import os
7
  from dotenv import load_dotenv
@@ -121,16 +122,111 @@ def validate_smiles(smiles: str) -> bool:
121
  return False
122
 
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  def repair_smiles(smiles: str, verbose: bool = False):
125
  """
126
  Attempt to repair and validate SMILES strings with multiple fallback strategies.
127
 
128
- This function tries multiple strategies to parse and repair SMILES:
129
  1. Direct parsing (most SMILES are already valid)
130
- 2. Parse without sanitization, then sanitize
131
- 3. Partial sanitization with error catching
132
- 4. Remove problematic stereochemistry markers
133
- 5. InChI round-trip (last resort)
 
134
 
135
  Args:
136
  smiles: Input SMILES string
@@ -156,7 +252,24 @@ def repair_smiles(smiles: str, verbose: bool = False):
156
  if verbose:
157
  logger.debug(f"Strategy 1 failed: {e}")
158
 
159
- # Strategy 2: Parse without sanitization, then sanitize carefully
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  try:
161
  mol = Chem.MolFromSmiles(smiles, sanitize=False)
162
  if mol is not None:
@@ -179,13 +292,16 @@ def repair_smiles(smiles: str, verbose: bool = False):
179
  pass
180
  except Exception as e:
181
  if verbose:
182
- logger.debug(f"Strategy 2 failed: {e}")
183
 
184
- # Strategy 3: Try removing stereochemistry markers that might be invalid
185
  if '/' in smiles or '\\' in smiles or '@' in smiles:
186
  try:
187
- # Remove stereochemistry markers
188
  cleaned = smiles.replace('/', '').replace('\\', '').replace('@', '')
 
 
 
 
189
  mol = Chem.MolFromSmiles(cleaned)
190
  if mol is not None:
191
  canonical = Chem.MolToSmiles(mol, canonical=True)
@@ -194,18 +310,36 @@ def repair_smiles(smiles: str, verbose: bool = False):
194
  return canonical
195
  except Exception as e:
196
  if verbose:
197
- logger.debug(f"Strategy 3 failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
- # Strategy 4: Try fixing common notation issues
200
  try:
201
- # Common issues: extra spaces, weird brackets, etc.
202
- cleaned = smiles.replace(' ', '')
 
 
203
 
204
- # Try parsing the cleaned version
205
  mol = Chem.MolFromSmiles(cleaned, sanitize=False)
206
  if mol is not None:
207
  try:
208
- # Try to kekulize and clean up
209
  Chem.Kekulize(mol, clearAromaticFlags=True)
210
  mol = Chem.RemoveHs(mol)
211
  canonical = Chem.MolToSmiles(mol, canonical=True)
@@ -213,7 +347,6 @@ def repair_smiles(smiles: str, verbose: bool = False):
213
  logger.info(f"✓ Repaired with kekulization: {smiles[:50]}")
214
  return canonical
215
  except Exception:
216
- # Even if kekulization fails, try to get SMILES
217
  try:
218
  canonical = Chem.MolToSmiles(mol, canonical=True)
219
  if canonical:
@@ -224,9 +357,9 @@ def repair_smiles(smiles: str, verbose: bool = False):
224
  pass
225
  except Exception as e:
226
  if verbose:
227
- logger.debug(f"Strategy 4 failed: {e}")
228
 
229
- # Strategy 5: Try InChI round-trip (last resort)
230
  try:
231
  mol = Chem.MolFromSmiles(smiles, sanitize=False)
232
  if mol is not None:
@@ -240,7 +373,7 @@ def repair_smiles(smiles: str, verbose: bool = False):
240
  return canonical
241
  except Exception as e:
242
  if verbose:
243
- logger.debug(f"Strategy 5 failed: {e}")
244
 
245
  # All strategies failed
246
  if verbose:
@@ -296,11 +429,17 @@ def get_smiles_info(smiles: str) -> dict:
296
  "is_valid": False,
297
  "can_parse": False,
298
  "can_sanitize": False,
 
 
299
  "repaired": None,
300
  "error": None
301
  }
302
 
303
  try:
 
 
 
 
304
  # Can we parse it?
305
  mol = Chem.MolFromSmiles(smiles, sanitize=False)
306
  result["can_parse"] = mol is not None
 
2
  from rdkit import Chem
3
  from rdkit.Chem import Descriptors, AllChem
4
  import logging
5
+ import re
6
 
7
  import os
8
  from dotenv import load_dotenv
 
122
  return False
123
 
124
 
125
def fix_parentheses(smiles: str) -> str:
    """
    Attempt to fix mismatched parentheses in SMILES string.

    The string is scanned left to right while tracking the current nesting
    depth. A closing parenthesis that has no opener before it is dropped,
    and any openers still unclosed at the end are closed by appending ')'.
    Already-balanced input is returned unchanged.

    Args:
        smiles: SMILES string with potential parentheses issues

    Returns:
        SMILES string with balanced parentheses
    """
    depth = 0
    kept = []
    for ch in smiles:
        if ch == '(':
            depth += 1
        elif ch == ')':
            if depth == 0:
                # Closer with nothing to close: counting alone cannot tell
                # where the surplus one is, so drop it at the point it occurs.
                continue
            depth -= 1
        kept.append(ch)

    # Whatever depth remains is the number of branches left open.
    return ''.join(kept) + ')' * depth
157
+
158
+
159
def fix_brackets(smiles: str) -> str:
    """
    Attempt to fix mismatched brackets in SMILES string.

    The string is scanned left to right while tracking how many '[' are
    currently open. A ']' that has no '[' before it is dropped, and any
    brackets still open at the end are closed by appending ']'.
    Already-balanced input is returned unchanged.

    Args:
        smiles: SMILES string with potential bracket issues

    Returns:
        SMILES string with balanced brackets
    """
    depth = 0
    kept = []
    for ch in smiles:
        if ch == '[':
            depth += 1
        elif ch == ']':
            if depth == 0:
                # Stray closer: remove it where it occurs instead of
                # guessing from counts that the surplus is at the end.
                continue
            depth -= 1
        kept.append(ch)

    # Close every bracket that was opened but never closed.
    return ''.join(kept) + ']' * depth
187
+
188
+
189
def truncate_invalid_suffix(smiles: str) -> str:
    """
    If SMILES has a clearly incomplete suffix, try to truncate it.

    A trailing '(' or '[' is simply removed. Otherwise progressively shorter
    proper prefixes are tried, longest first, and the first one that
    Chem.MolFromSmiles (with sanitize=False) returns a molecule for is
    returned. If no prefix parses, the input is returned unchanged.

    Args:
        smiles: SMILES string

    Returns:
        Truncated SMILES if applicable
    """
    # A dangling opener cannot be completed, so just cut it off.
    if smiles.endswith(('(', '[')):
        return smiles[:-1]

    # Heuristic: longest proper prefix that parses (the full string and the
    # empty prefix are never tried).
    for end in range(len(smiles) - 1, 0, -1):
        candidate = smiles[:end]
        try:
            mol = Chem.MolFromSmiles(candidate, sanitize=False)
        except Exception:
            # Treat a parser error like an unparsable prefix, but let
            # KeyboardInterrupt/SystemExit propagate.
            continue
        if mol is not None:
            return candidate

    return smiles
217
+
218
+
219
  def repair_smiles(smiles: str, verbose: bool = False):
220
  """
221
  Attempt to repair and validate SMILES strings with multiple fallback strategies.
222
 
223
+ This function tries multiple strategies including structural repair:
224
  1. Direct parsing (most SMILES are already valid)
225
+ 2. Fix parentheses/brackets
226
+ 3. Remove stereochemistry
227
+ 4. Truncate incomplete suffixes
228
+ 5. Parse without sanitization
229
+ 6. InChI round-trip
230
 
231
  Args:
232
  smiles: Input SMILES string
 
252
  if verbose:
253
  logger.debug(f"Strategy 1 failed: {e}")
254
 
255
+ # Strategy 2: Fix structural issues (parentheses, brackets)
256
+ try:
257
+ # Fix parentheses
258
+ fixed = fix_parentheses(smiles)
259
+ fixed = fix_brackets(fixed)
260
+
261
+ if fixed != smiles:
262
+ mol = Chem.MolFromSmiles(fixed)
263
+ if mol is not None:
264
+ canonical = Chem.MolToSmiles(mol, canonical=True)
265
+ if verbose:
266
+ logger.info(f"✓ Repaired by fixing parentheses/brackets: {smiles[:50]}")
267
+ return canonical
268
+ except Exception as e:
269
+ if verbose:
270
+ logger.debug(f"Strategy 2 failed: {e}")
271
+
272
+ # Strategy 3: Parse without sanitization, then sanitize carefully
273
  try:
274
  mol = Chem.MolFromSmiles(smiles, sanitize=False)
275
  if mol is not None:
 
292
  pass
293
  except Exception as e:
294
  if verbose:
295
+ logger.debug(f"Strategy 3 failed: {e}")
296
 
297
+ # Strategy 4: Try removing stereochemistry markers
298
  if '/' in smiles or '\\' in smiles or '@' in smiles:
299
  try:
 
300
  cleaned = smiles.replace('/', '').replace('\\', '').replace('@', '')
301
+ # Also try fixing parentheses on cleaned version
302
+ cleaned = fix_parentheses(cleaned)
303
+ cleaned = fix_brackets(cleaned)
304
+
305
  mol = Chem.MolFromSmiles(cleaned)
306
  if mol is not None:
307
  canonical = Chem.MolToSmiles(mol, canonical=True)
 
310
  return canonical
311
  except Exception as e:
312
  if verbose:
313
+ logger.debug(f"Strategy 4 failed: {e}")
314
+
315
+ # Strategy 5: Try truncating incomplete suffix
316
+ try:
317
+ truncated = truncate_invalid_suffix(smiles)
318
+ if truncated != smiles:
319
+ # Also fix parentheses on truncated version
320
+ truncated = fix_parentheses(truncated)
321
+ truncated = fix_brackets(truncated)
322
+
323
+ mol = Chem.MolFromSmiles(truncated)
324
+ if mol is not None:
325
+ canonical = Chem.MolToSmiles(mol, canonical=True)
326
+ if verbose:
327
+ logger.info(f"✓ Repaired by truncation: {smiles[:50]} -> {truncated[:50]}")
328
+ return canonical
329
+ except Exception as e:
330
+ if verbose:
331
+ logger.debug(f"Strategy 5 failed: {e}")
332
 
333
+ # Strategy 6: Try fixing common notation issues
334
  try:
335
+ # Remove spaces and fix double equals
336
+ cleaned = smiles.replace(' ', '').replace('(=O)(=O)', '(=O)')
337
+ cleaned = fix_parentheses(cleaned)
338
+ cleaned = fix_brackets(cleaned)
339
 
 
340
  mol = Chem.MolFromSmiles(cleaned, sanitize=False)
341
  if mol is not None:
342
  try:
 
343
  Chem.Kekulize(mol, clearAromaticFlags=True)
344
  mol = Chem.RemoveHs(mol)
345
  canonical = Chem.MolToSmiles(mol, canonical=True)
 
347
  logger.info(f"✓ Repaired with kekulization: {smiles[:50]}")
348
  return canonical
349
  except Exception:
 
350
  try:
351
  canonical = Chem.MolToSmiles(mol, canonical=True)
352
  if canonical:
 
357
  pass
358
  except Exception as e:
359
  if verbose:
360
+ logger.debug(f"Strategy 6 failed: {e}")
361
 
362
+ # Strategy 7: Try InChI round-trip (last resort)
363
  try:
364
  mol = Chem.MolFromSmiles(smiles, sanitize=False)
365
  if mol is not None:
 
373
  return canonical
374
  except Exception as e:
375
  if verbose:
376
+ logger.debug(f"Strategy 7 failed: {e}")
377
 
378
  # All strategies failed
379
  if verbose:
 
429
  "is_valid": False,
430
  "can_parse": False,
431
  "can_sanitize": False,
432
+ "has_paren_issues": False,
433
+ "has_bracket_issues": False,
434
  "repaired": None,
435
  "error": None
436
  }
437
 
438
  try:
439
+ # Check for structural issues
440
+ result["has_paren_issues"] = smiles.count('(') != smiles.count(')')
441
+ result["has_bracket_issues"] = smiles.count('[') != smiles.count(']')
442
+
443
  # Can we parse it?
444
  mol = Chem.MolFromSmiles(smiles, sanitize=False)
445
  result["can_parse"] = mol is not None