ribesstefano committed on
Commit
7d38fe9
·
1 Parent(s): 2842604

Removed splitting bonds highlights and added examples.

Browse files
protac_splitter/display_utils.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import sys
 
3
  from typing import Optional
4
 
5
  from rdkit import Chem
@@ -15,7 +16,7 @@ def safe_display(*args):
15
  if 'ipykernel' in sys.modules:
16
  display(*args)
17
  else:
18
- print(*args)
19
 
20
 
21
  def display_mol(
@@ -28,7 +29,7 @@ def display_mol(
28
  ):
29
  """ Display a molecule in a Jupyter notebook. Useful for having """
30
  if mol is None:
31
- print('Molecule is None')
32
  return None
33
  if use_smiles_as_legend and legend is None:
34
  legend = Chem.MolToSmiles(mol)
@@ -97,7 +98,7 @@ def get_mapped_protac_img(
97
  return None
98
 
99
  if linker_smiles in ['[*:1][*:2]', '[*:2][*:1]']:
100
- print('WARNING. Linker is empty.')
101
  poi_attachment_idx = get_atom_idx_at_attachment(protac_mol, poi_mol, e3_mol)
102
  e3_attachment_idx = get_atom_idx_at_attachment(protac_mol, e3_mol, poi_mol)
103
  else:
@@ -118,9 +119,9 @@ def get_mapped_protac_img(
118
  if poi_attachment_idx is not None:
119
  if len(poi_attachment_idx) != 2:
120
  if linker_smiles in ['[*:1][*:2]', '[*:2][*:1]']:
121
- print(f'WARNING. Linker is empty, no highlighting will be showed for the POI.')
122
  else:
123
- print(f'WARNING. POI attachment points must be only two, got instead: {poi_attachment_idx}')
124
  else:
125
  poi_bond_idx = protac_mol.GetBondBetweenAtoms(*poi_attachment_idx).GetIdx()
126
  highlight_atoms += poi_attachment_idx
@@ -132,9 +133,9 @@ def get_mapped_protac_img(
132
  if e3_attachment_idx is not None:
133
  if len(e3_attachment_idx) != 2:
134
  if linker_smiles in ['[*:1][*:2]', '[*:2][*:1]']:
135
- print(f'WARNING. Linker is empty, no highlighting will be showed for the E3.')
136
  else:
137
- print(f'WARNING. E3 attachment points must be only two, got instead: {e3_attachment_idx}')
138
  else:
139
  e3_bond_idx = protac_mol.GetBondBetweenAtoms(*e3_attachment_idx).GetIdx()
140
  highlight_atoms += e3_attachment_idx
 
1
  import os
2
  import sys
3
+ import logging
4
  from typing import Optional
5
 
6
  from rdkit import Chem
 
16
  if 'ipykernel' in sys.modules:
17
  display(*args)
18
  else:
19
+ logging.warning(*args)
20
 
21
 
22
  def display_mol(
 
29
  ):
30
  """ Display a molecule in a Jupyter notebook. Useful for having """
31
  if mol is None:
32
+ logging.warning('Molecule is None')
33
  return None
34
  if use_smiles_as_legend and legend is None:
35
  legend = Chem.MolToSmiles(mol)
 
98
  return None
99
 
100
  if linker_smiles in ['[*:1][*:2]', '[*:2][*:1]']:
101
+ logging.warning('WARNING. Linker is empty.')
102
  poi_attachment_idx = get_atom_idx_at_attachment(protac_mol, poi_mol, e3_mol)
103
  e3_attachment_idx = get_atom_idx_at_attachment(protac_mol, e3_mol, poi_mol)
104
  else:
 
119
  if poi_attachment_idx is not None:
120
  if len(poi_attachment_idx) != 2:
121
  if linker_smiles in ['[*:1][*:2]', '[*:2][*:1]']:
122
+ logging.warning(f'WARNING. Linker is empty, no highlighting will be showed for the POI.')
123
  else:
124
+ logging.warning(f'WARNING. POI attachment points must be only two, got instead: {poi_attachment_idx}')
125
  else:
126
  poi_bond_idx = protac_mol.GetBondBetweenAtoms(*poi_attachment_idx).GetIdx()
127
  highlight_atoms += poi_attachment_idx
 
133
  if e3_attachment_idx is not None:
134
  if len(e3_attachment_idx) != 2:
135
  if linker_smiles in ['[*:1][*:2]', '[*:2][*:1]']:
136
+ logging.warning(f'WARNING. Linker is empty, no highlighting will be showed for the E3.')
137
  else:
138
+ logging.warning(f'WARNING. E3 attachment points must be only two, got instead: {e3_attachment_idx}')
139
  else:
140
  e3_bond_idx = protac_mol.GetBondBetweenAtoms(*e3_attachment_idx).GetIdx()
141
  highlight_atoms += e3_attachment_idx
protac_splitter_app.py CHANGED
@@ -79,55 +79,61 @@ def process_single_smiles(protac_smiles: str, use_transformer: bool = False, use
79
  raise gr.Error(f"An error occurred while processing the input SMILES: {exception_message}", duration=10)
80
 
81
  valid_molecules = []
82
- pred_key = f'default_pred_n0'
83
  valid_molecules.append(results[pred_key])
84
 
85
  # Generate images and corresponding SMILES text
86
  images = []
87
- smiles_texts = []
88
  input_mol = Chem.MolFromSmiles(protac_smiles)
89
 
90
  if input_mol is not None:
91
  input_img = Draw.MolToImage(input_mol, legend="", size=(1000, 200))
92
  else:
93
- input_img = Image.new('RGB', (1000, 1000))
94
 
 
95
  splits = {}
96
  for smiles in results[pred_key].split("."):
97
  mol = Chem.MolFromSmiles(smiles)
98
  if mol:
99
  if "[*:1]" in smiles and "[*:2]" in smiles:
100
  legend = "Linker"
101
- splits['linker'] = smiles
102
  elif "[*:1]" in smiles:
103
  legend = "Warhead"
104
- splits['poi'] = smiles
105
  elif "[*:2]" in smiles:
106
  legend = "E3 Ligase Ligand"
107
- splits['e3'] = smiles
108
 
109
  img = Draw.MolToImage(mol, legend="", size=(1000, 1000))
110
  images.append(img)
111
- smiles_texts.append(f"{legend}: {smiles}")
112
- smiles_texts = "\n".join(smiles_texts)
113
-
114
- use_svg = False
115
- input_img = get_mapped_protac_img(
116
- protac_smiles=protac_smiles,
117
- poi_smiles=splits.get('poi', ''),
118
- linker_smiles=splits.get('linker', ''),
119
- e3_smiles=splits.get('e3', ''),
120
- w=1000,
121
- h=500,
122
- legend=None,
123
- useSVG=use_svg,
124
- )
125
-
126
- if use_svg:
127
- input_img = save_svg_to_tempfile(input_img)
128
- logging.debug(f"Returning processed image path: {input_img}")
129
-
130
- return input_img, list(images), smiles_texts
 
 
 
 
 
 
131
 
132
  def process_csv(
133
  file: gr.File,
@@ -198,8 +204,17 @@ def create_interface():
198
  Returns:
199
  gr.Blocks: The Gradio interface
200
  """
201
- with gr.Blocks() as demo:
202
- header = """# PROTAC-Splitter Web Application
 
 
 
 
 
 
 
 
 
203
 
204
  Upload a CSV file or enter a single SMILES string to predict PROTAC substructures.
205
 
@@ -208,35 +223,49 @@ Warheads and E3 ligase ligands connections to the linker are marked with dummy a
208
  - Warhead: `[*:1]`
209
  - E3 Ligase ligand: `[*:2]`
210
 
211
- """
212
- gr.Markdown(header)
 
 
 
 
 
 
 
 
 
 
213
 
 
214
  # Model selection section - common to both tabs
215
- model_selection = """## Model Selection
 
216
 
217
  You can choose which model to use for splitting PROTAC molecules:
218
 
219
  - **XGBoost model** (default): Fast graph-based edge classification model
220
- - **Transformer model**: More accurate but slower deep learning model
221
  - If both are selected, the Transformer model will be used first, then if it fails, the XGBoost model will be used.
222
  - If no model is selected, splitting will be done using graph-based heuristics, with no AI model involved.
223
 
224
- For fast splitting, we reccommend using the XGBoost model only, which is fast and efficient for most cases. The Transformer model might be more accurate but it is slower, especially for processing large CSV files.
225
- """
226
- gr.Markdown(model_selection)
 
227
  with gr.Row():
228
  with gr.Column(scale=2):
229
  with gr.Row():
230
  use_xgboost = gr.Checkbox(label="Use XGBoost model", value=True)
231
  use_transformer = gr.Checkbox(label="Use Transformer model", value=False)
232
 
 
233
  # Performance configuration section
234
- performance_configs = """### Performance Configurations
 
235
 
236
  Change the following parameters to optimize performance based on your machine's capabilities. Particularly useful when processing large CSV files or when using the Transformer model.
237
  For single SMILES processing, the default values should work well in most cases.
238
- """
239
- gr.Markdown(performance_configs)
240
  with gr.Column(scale=1):
241
  # Add a num_proc input
242
  with gr.Row():
@@ -285,32 +314,68 @@ For single SMILES processing, the default values should work well in most cases.
285
  outputs=[batch_size]
286
  )
287
 
 
288
  # Single SMILES Input tab
289
- gr.Markdown("## Specify Inputs")
 
 
 
 
 
 
290
  with gr.Tab("Single SMILES Input"):
291
  # Input area
292
  # NOTE: A challenging SMILES to test the app is: CC(C)(C)S(=O)(=O)c1cc2c(Nc3ccc4scnc4c3)ccnc2cc1OCCOCCCOCCOCC(=O)Nc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O
293
  smiles_input = gr.Textbox(
294
- label="Enter SMILES String",
295
  placeholder="E.g., CC(C)(C)S(=O)(=O)c1cc2c(Nc3ccc4scnc4c3)ccnc2cc1OCCOCCOCCOCCOCC(=O)Nc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O",
296
- # value="CC(C)(C)S(=O)(=O)c1cc2c(Nc3ccc4scnc4c3)ccnc2cc1OCCOCCOCCOCCOCC(=O)Nc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O",
297
  )
298
-
299
  submit_smiles = gr.Button("Process SMILES")
300
 
301
  # Output area
302
- smiles_input_image = gr.Image(label="Input PROTAC", type="filepath") # Use None to allow SVG input
303
- smiles_output_images = gr.Gallery(label="Valid Splits", columns=3)
304
- smiles_output_texts = gr.Textbox(label="SMILES of the Splits", interactive=False, lines=3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
  # Connect the button click event to the processing function
307
  submit_smiles.click(
308
  process_single_smiles,
309
  inputs=[smiles_input, use_transformer, use_xgboost, beam_size],
310
- outputs=[smiles_input_image, smiles_output_images, smiles_output_texts]
311
  )
312
 
 
313
  # CSV file processing tab
 
314
  with gr.Tab("Upload CSV"):
315
  # File upload area
316
  file_input = gr.File(label="Upload CSV File")
@@ -331,13 +396,12 @@ For single SMILES processing, the default values should work well in most cases.
331
  outputs=[download_output]
332
  )
333
 
334
- csv_notes = f"""**Note:** The output CSV will contain the following columns:
335
 
336
- - `{smiles_column}`: The original PROTAC SMILES string
337
  - `default_pred_n0`: The predicted SMILES strings for the splits
338
  - `model_name`: The model used for the prediction
339
- """
340
- gr.Markdown(csv_notes)
341
 
342
  return demo
343
 
 
79
  raise gr.Error(f"An error occurred while processing the input SMILES: {exception_message}", duration=10)
80
 
81
  valid_molecules = []
82
+ pred_key = f"default_pred_n0"
83
  valid_molecules.append(results[pred_key])
84
 
85
  # Generate images and corresponding SMILES text
86
  images = []
 
87
  input_mol = Chem.MolFromSmiles(protac_smiles)
88
 
89
  if input_mol is not None:
90
  input_img = Draw.MolToImage(input_mol, legend="", size=(1000, 200))
91
  else:
92
+ input_img = Image.new("RGB", (1000, 1000))
93
 
94
+ smiles_texts = []
95
  splits = {}
96
  for smiles in results[pred_key].split("."):
97
  mol = Chem.MolFromSmiles(smiles)
98
  if mol:
99
  if "[*:1]" in smiles and "[*:2]" in smiles:
100
  legend = "Linker"
101
+ splits["linker"] = smiles
102
  elif "[*:1]" in smiles:
103
  legend = "Warhead"
104
+ splits["poi"] = smiles
105
  elif "[*:2]" in smiles:
106
  legend = "E3 Ligase Ligand"
107
+ splits["e3"] = smiles
108
 
109
  img = Draw.MolToImage(mol, legend="", size=(1000, 1000))
110
  images.append(img)
111
+ # smiles_texts.append(f"{legend}: {smiles}")
112
+ smiles_texts.append(smiles)
113
+
114
+ smiles_texts = ".".join(smiles_texts)
115
+ smiles_df = pd.DataFrame({
116
+ "Substructure": ["E3 Ligase Ligand", "Linker", "Warhead"],
117
+ "SMILES": [splits.get("e3", ""), splits.get("linker", ""), splits.get("poi", "")]
118
+ })
119
+
120
+ # use_svg = False
121
+ # input_img = get_mapped_protac_img(
122
+ # protac_smiles=protac_smiles,
123
+ # poi_smiles=splits.get('poi', ''),
124
+ # linker_smiles=splits.get('linker', ''),
125
+ # e3_smiles=splits.get('e3', ''),
126
+ # w=1000,
127
+ # h=500,
128
+ # legend=None,
129
+ # useSVG=use_svg,
130
+ # )
131
+ #
132
+ # if use_svg:
133
+ # input_img = save_svg_to_tempfile(input_img)
134
+ # logging.debug(f"Returning processed image path: {input_img}")
135
+
136
+ return input_img, list(images), smiles_texts, smiles_df
137
 
138
  def process_csv(
139
  file: gr.File,
 
204
  Returns:
205
  gr.Blocks: The Gradio interface
206
  """
207
+ css = """
208
+ h1 {
209
+ text-align: center;
210
+ display:block;
211
+ }
212
+ """
213
+ with gr.Blocks(css=css) as demo:
214
+ # ----------------------------------------------------------------------
215
+ # Application title and description
216
+ # ----------------------------------------------------------------------
217
+ gr.Markdown("""# ✂️ PROTAC-Splitter Web Application ✂️
218
 
219
  Upload a CSV file or enter a single SMILES string to predict PROTAC substructures.
220
 
 
223
  - Warhead: `[*:1]`
224
  - E3 Ligase ligand: `[*:2]`
225
 
226
+ If you find this tool useful, please consider citing the following paper:
227
+
228
+ ```
229
+ @article{ribes2025protac,
230
+ title={PROTAC-Splitter...},
231
+ author={Ribes, Stefano and others},
232
+ journal={Journal of...},
233
+ year={2025},
234
+ publisher={...}
235
+ }
236
+ ```
237
+ """)
238
 
239
+ # ----------------------------------------------------------------------
240
  # Model selection section - common to both tabs
241
+ # ----------------------------------------------------------------------
242
+ gr.Markdown("""## Model Selection
243
 
244
  You can choose which model to use for splitting PROTAC molecules:
245
 
246
  - **XGBoost model** (default): Fast graph-based edge classification model
247
+ - **Transformer model**: Often more accurate, but slower deep learning model
248
  - If both are selected, the Transformer model will be used first, then if it fails, the XGBoost model will be used.
249
  - If no model is selected, splitting will be done using graph-based heuristics, with no AI model involved.
250
 
251
+ For fast splitting, we recommend using the XGBoost model only, which is fast and efficient for most cases.
252
+
253
+ The Transformer model runs on CPU, so it is slower, especially for processing large CSV files.
254
+ """)
255
  with gr.Row():
256
  with gr.Column(scale=2):
257
  with gr.Row():
258
  use_xgboost = gr.Checkbox(label="Use XGBoost model", value=True)
259
  use_transformer = gr.Checkbox(label="Use Transformer model", value=False)
260
 
261
+ # ----------------------------------------------------------------------
262
  # Performance configuration section
263
+ # ----------------------------------------------------------------------
264
+ gr.Markdown("""### Performance Configurations
265
 
266
  Change the following parameters to optimize performance based on your machine's capabilities. Particularly useful when processing large CSV files or when using the Transformer model.
267
  For single SMILES processing, the default values should work well in most cases.
268
+ """)
 
269
  with gr.Column(scale=1):
270
  # Add a num_proc input
271
  with gr.Row():
 
314
  outputs=[batch_size]
315
  )
316
 
317
+ # ----------------------------------------------------------------------
318
  # Single SMILES Input tab
319
+ # ----------------------------------------------------------------------
320
+ gr.Markdown("""## Specify Inputs
321
+
322
+ **Disclaimer**: The input SMILES is checked for validity before processing. There is no check on whether the SMILES is a PROTAC-like molecule or not.
323
+ For example, attempting to split the SMILES `c1ccccc1` (benzene) with the XGBoost or heuristic strategies will return an error, as ring bonds are ignored for splitting.
324
+ On the other hand, `c1ccccc1CCC1CCCC1` will return a plausible split, even though it is not a PROTAC molecule.
325
+ """)
326
  with gr.Tab("Single SMILES Input"):
327
  # Input area
328
  # NOTE: A challenging SMILES to test the app is: CC(C)(C)S(=O)(=O)c1cc2c(Nc3ccc4scnc4c3)ccnc2cc1OCCOCCCOCCOCC(=O)Nc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O
329
  smiles_input = gr.Textbox(
330
+ label="Enter SMILES String",
331
  placeholder="E.g., CC(C)(C)S(=O)(=O)c1cc2c(Nc3ccc4scnc4c3)ccnc2cc1OCCOCCOCCOCCOCC(=O)Nc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O",
 
332
  )
 
333
  submit_smiles = gr.Button("Process SMILES")
334
 
335
  # Output area
336
+ smiles_input_image = gr.Image(label="Input PROTAC")
337
+ smiles_output_images = gr.Gallery(
338
+ label="Predicted Splits",
339
+ columns=3,
340
+ )
341
+ smiles_output_df = gr.DataFrame(
342
+ label="Substructure Predictions",
343
+ interactive=False,
344
+ headers=["Substructure", "SMILES"],
345
+ show_copy_button=True,
346
+ )
347
+ smiles_output_texts = gr.Textbox(
348
+ label="SMILES of the Splits",
349
+ interactive=False,
350
+ lines=1,
351
+ show_copy_button=True,
352
+ )
353
+
354
+ # Add this Examples component
355
+ gr.Examples(
356
+ examples=[
357
+ # SMILES, use_transformer, use_xgboost, beam_size
358
+ ["CC(C)(C)S(=O)(=O)c1cc2c(Nc3ccc4scnc4c3)ccnc2cc1OCCOCCOCCOCCOCC(=O)Nc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O", False, True, 5],
359
+ ["Cc1nnc2n1-c1sc(C#Cc3cnn(-c4cccc5c4C(=O)N(C4CCC(=O)NC4=O)C5=O)c3)c(Cc3ccccc3)c1COC2", False, True, 5],
360
+ ["c1ccccc1CCC1CCCC1", False, False, 5],
361
+ ["O=C(NCCOCCOCCN1CCCC1)Nc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O", False, False, 5],
362
+ ],
363
+ inputs=[smiles_input, use_transformer, use_xgboost, beam_size],
364
+ outputs=[smiles_input_image, smiles_output_images, smiles_output_texts, smiles_output_df],
365
+ fn=process_single_smiles,
366
+ cache_examples=True,
367
+ )
368
 
369
  # Connect the button click event to the processing function
370
  submit_smiles.click(
371
  process_single_smiles,
372
  inputs=[smiles_input, use_transformer, use_xgboost, beam_size],
373
+ outputs=[smiles_input_image, smiles_output_images, smiles_output_texts, smiles_output_df]
374
  )
375
 
376
+ # ----------------------------------------------------------------------
377
  # CSV file processing tab
378
+ # ----------------------------------------------------------------------
379
  with gr.Tab("Upload CSV"):
380
  # File upload area
381
  file_input = gr.File(label="Upload CSV File")
 
396
  outputs=[download_output]
397
  )
398
 
399
+ gr.Markdown(f"""**Note:** The output CSV will contain the following columns:
400
 
401
+ - `smiles_column`: The original PROTAC SMILES string
402
  - `default_pred_n0`: The predicted SMILES strings for the splits
403
  - `model_name`: The model used for the prediction
404
+ """)
 
405
 
406
  return demo
407