cafierom committed on
Commit
51a0d60
·
verified ·
1 Parent(s): e29b036

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +164 -7
app.py CHANGED
@@ -8,12 +8,31 @@ import random
8
  import deepchem
9
  from rdkit import Chem
10
  from rdkit.Chem import Draw
 
11
 
12
# Hub checkpoint shared by the tokenizer and the fill-mask pipeline below.
model_name = "cafierom/bert-base-cased-ChemTok-ZN250K-V1"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding=True,
    truncation=True,
)
mask_filler = pipeline("fill-mask", model_name)
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
def tokenize(batch):
    """Run the module-level tokenizer over the ``text`` entry of ``batch``.

    The call pads, truncates to at most 250 tokens and requests the
    special-tokens mask alongside the encoded inputs.
    """
    encoding_options = {
        "padding": True,
        "truncation": True,
        "max_length": 250,
        "return_special_tokens_mask": True,
    }
    return tokenizer(batch["text"], **encoding_options)
19
 
@@ -141,6 +160,41 @@ def calc_qed(smiles):
141
  qed = [Chem.QED.default(mol) for mol in mols]
142
  return qed,mols
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  def gen_mask(smile_in: str, percent_mask: float) -> str:
145
  """
146
  Generate Analogues of a hit for hit expansion using generative mask-filling.
@@ -215,13 +269,116 @@ def gen_mask(smile_in: str, percent_mask: float) -> str:
215
  img = None
216
  return out_text,img
217
 
218
# Single-function UI: a SMILES box plus a masking-fraction selector feed
# gen_mask, whose two return values fill the text box and the image.
gradio_app = gr.Interface(
    fn=gen_mask,
    inputs=[
        gr.Textbox(label="SMILES for hit expansion"),
        gr.Radio(
            choices=[0.10, 0.15, 0.20],
            label="Fraction of hit molecule to mask.",
            value=0.15,
            interactive=True,
        ),
    ],
    outputs=[
        gr.Textbox(label="New Molecules: "),
        gr.Image(label="Molecule Images:"),
    ],
    title="Generate Analogues of a hit for hit expansion using generative mask-filling.",
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
# Only start the app when this file is executed directly; mcp_server=True is
# passed through to Gradio's launch().
if __name__ == "__main__":
    gradio_app.launch(
        mcp_server=True,
    )
 
8
  import deepchem
9
  from rdkit import Chem
10
  from rdkit.Chem import Draw
11
+ import regex as re
12
 
13
# Hub checkpoint shared by the tokenizer and the fill-mask pipeline below.
model_name = "cafierom/bert-base-cased-ChemTok-ZN250K-V1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True)
mask_filler = pipeline("fill-mask", model_name)

# Regex patterns locating substitution sites in an aromatic SMILES ring.
# Raw strings are used so that "\(" and "\)" reach the regex engine unchanged
# instead of relying on Python passing through an invalid string escape.
# Entries line up one-to-one with sub_location_names; sub_rings uses the first
# three for the "cc" substitution point and the remaining five otherwise.
sub_locations_re = [
    r"cc",                          # first unsubstituted carbons encountered
    r"c[1-9]cc",                    # unsubstituted carbon 2 of ring
    r"ccc[1-9]",                    # unsubstituted carbon 4 of ring
    r"c[1-9]c(\([A-Z]+\))?c",       # carbon 2 of ring
    r"c[1-9]cc(\([A-Z]+\))?c",      # carbon 3 of ring
    r"c[1-9]ccc(\([A-Z]+\))?c",     # carbon 4 of ring
    r"c[1-9]cccc(\([A-Z]+\))?c",    # carbon 5 of ring
    r"c[1-9]ccccc(\([A-Z]+\))?",    # carbon 6 of ring
]

# Human-readable legend text for each pattern in sub_locations_re.
sub_location_names = [
    "any unsubbed carbon",
    "unsubbed carbon at C2",
    "unsubbed carbon at C4",
    "substituent on C2",
    "substituent on C3",
    "substituent on C4",
    "substituent on C5",
    "substituent on C6",
]

# SMILES fragments that may be swapped out for one of new_fragments.
possible_sub_points = ["cc", "c(O)c", "c(OC)c"]

# Replacement fragments, in the same order as new_fragment_names.
new_fragments = ["c(F)c", "c(C#N)c", "c(I)c", "c([N+]([O-])=O)c", "c(OC)c", "c(Cl)c"]

new_fragment_names = ["Fluoro", "Cyano", "Iodo", "Nitro", "Methoxy", "Chloro"]
35
+
36
def tokenize(batch):
    """Run the module-level tokenizer over the ``text`` entry of ``batch``.

    The call pads, truncates to at most 250 tokens and requests the
    special-tokens mask alongside the encoded inputs.
    """
    encoding_options = {
        "padding": True,
        "truncation": True,
        "max_length": 250,
        "return_special_tokens_mask": True,
    }
    return tokenizer(batch["text"], **encoding_options)
38
 
 
160
  qed = [Chem.QED.default(mol) for mol in mols]
161
  return qed,mols
162
 
163
def make_sub_string(match):
    '''
    Replacement callback for a regex substitution.

    Looks for the substitution point currently held in the module-level
    ``sub_point_stored`` inside the matched text. When present, every
    occurrence within the match is swapped for ``new_fragment_stored``;
    otherwise the miss is counted in ``could_not_match`` and the matched text
    is handed back untouched.

    Args:
        match: a regex match object

    Returns:
        the substituted string, or the original matched string if the
        substitution point was not found in it
    '''
    global could_not_match

    matched_text = match.group()

    if sub_point_stored not in matched_text:
        could_not_match += 1  # make a list of what we can't match?
        return matched_text

    return matched_text.replace(sub_point_stored, new_fragment_stored)
187
+
188
def hold_values(sub_point, new_fragment):
    '''
    Stash the substitution point and the replacement fragment in the
    module-level globals ``sub_point_stored`` and ``new_fragment_stored`` so
    that make_sub_string can read them.
    '''
    global sub_point_stored, new_fragment_stored
    sub_point_stored, new_fragment_stored = sub_point, new_fragment
197
+
198
  def gen_mask(smile_in: str, percent_mask: float) -> str:
199
  """
200
  Generate Analogues of a hit for hit expansion using generative mask-filling.
 
269
  img = None
270
  return out_text,img
271
 
272
def sub_rings(smile_in: str, number_subs: int = 1) -> tuple:
    '''
    Accepts a SMILES string and tries all possible substitutions indicated by the
    possible_sub_points list and the new_fragments list. Specific cases of the
    possible_sub_points list are found in the sub_locations_re list as regex. The
    lists have corresponding name lists.

    Args:
        smile_in: a SMILES string
        number_subs: the number of substitutions to make per molecule

    Returns:
        out_text: a text string listing every generated SMILES together with
            its QED value, or "Invalid SMILES string" if processing failed.
        img: an image of the molecules with legends, or None when nothing was
            generated or processing failed.
    '''
    global could_not_match

    # Ring with a ring-closure digit at both ends and an optional bracketed
    # upper-case substituent after each of the four inner aromatic carbons.
    ring_pattern = (
        r"c[1-9]c(\([A-Z]+\))?c(\([A-Z]+\))?c(\([A-Z]+\))?c(\([A-Z]+\))?c[1-9]"
    )

    try:
        new_smiles = []
        new_legends = []
        could_not_match = 0

        # The ring search depends only on the input, so run it once instead of
        # once per (location, fragment) combination.
        ring = re.search(ring_pattern, smile_in)

        if ring:
            ring_text = ring.group()
            for sub_point in possible_sub_points:
                # Only attempt substitution points that occur in the ring.
                if sub_point not in ring_text:
                    continue

                # The first three location patterns describe unsubstituted
                # carbons ("cc"); the rest describe already-substituted ones.
                if sub_point == "cc":
                    sub_locations = sub_locations_re[:3]
                    sub_names = sub_location_names[:3]
                else:
                    sub_locations = sub_locations_re[3:]
                    sub_names = sub_location_names[3:]

                for specific_frag, frag_name in zip(sub_locations, sub_names):
                    for new_fragment in new_fragments:
                        # make_sub_string reads these values from globals.
                        hold_values(sub_point, new_fragment)
                        new_mol = re.sub(
                            specific_frag,
                            make_sub_string,
                            smile_in,
                            count=number_subs,
                        )
                        if new_mol != smile_in and new_mol not in new_smiles:
                            new_smiles.append(new_mol)
                            # Drop the flanking "c(" / ")c" to leave the group.
                            substituent = new_fragment.strip("c()")
                            new_legends.append(
                                f"{frag_name} substitution with {substituent}."
                            )

        qeds, mols = calc_qed(new_smiles)

        text_lines = [
            f"Total SMILES generated for hit: {len(new_smiles)}\n",
            "===================================================\n",
        ]
        text_lines.extend(
            f"analogue {i}: {smile} with QED: {qed:.3f}\n"
            for i, (smile, qed) in enumerate(zip(new_smiles, qeds), start=1)
        )
        out_text = "".join(text_lines)

        legends = [
            f"QED: {qed:.3f}\n" + legend
            for qed, legend in zip(qeds, new_legends)
        ]

        print(f"Could not match {could_not_match} requests.")

        if new_smiles:
            img = Draw.MolsToGridImage(
                mols,
                legends=legends,
                molsPerRow=3,
                subImgSize=(200, 200),
                useSVG=False,
                returnPNG=False,
            )
        else:
            # Nothing was generated, so there is no grid to render.
            img = None

    except Exception as err:
        # Keep the user-facing message, but log the real cause for debugging.
        print(f"sub_rings failed: {err!r}")
        out_text = "Invalid SMILES string"
        img = None

    # Hand the results to the caller; previously (None, None) was returned
    # unconditionally and the computed text and image were lost.
    return out_text, img
336
+
337
# Two-button UI: one SMILES input drives either generative mask-filling
# (gen_mask) or rule-based ring substitution (sub_rings). Both write to the
# same text box and image.
with gr.Blocks() as gradio_app:
    gr.Markdown(
        """
        # Generate Analogues of a hit for hit expansion using generative mask-filling or
        ring subsitutions.

        - The hit molecule is input by the user; this molecule is then masked in different,
        random ways. A model, cafierom/bert-base-cased-ChemTok-ZN250K-V1,
        is used to generate SMILES strings for analogue molecules by unmasking the
        hit molecule. All possibilities created by the generative mask-filling
        are kept as long as the probability is greater than a cut-off, which is set
        to 0.1 but which may be changed.

        - The hit molecule may also be substituted with the groups in the new fragments list
        on any phenyl ring at the points listed in sub location names list.

        - The QED value, or quantitative estimate of druglikeness, a weighted average of
        various ADME properties is also calculated. A value of 1.0 is perfect
        drug-likeness, and a value of 0.0 is not drug-like. A value of about 0.5
        is average for many drugs.
        """)

    smile = gr.Textbox(label="SMILES for hit expansion")
    # gen_mask takes the masking fraction as a required second argument, so
    # the UI has to supply it.
    percent_mask = gr.Radio(
        choices=[0.10, 0.15, 0.20],
        label="Fraction of hit molecule to mask.",
        value=0.15,
        interactive=True,
    )
    with gr.Row():
        # Distinct labels so the two generation modes can be told apart.
        mask_btn = gr.Button("Generate analogues by mask-filling.")
        sub_btn = gr.Button("Generate analogues by ring substitution.")

    with gr.Row():
        results = gr.Textbox(label="New Molecules: ")
        mol_pic = gr.Image(label="Molecule Images:")

    @mask_btn.click(inputs=[smile, percent_mask], outputs=[results, mol_pic])
    def do_genmask(smile, percent_mask):
        """Run mask-filling analogue generation for the entered SMILES."""
        return gen_mask(smile, percent_mask)

    @sub_btn.click(inputs=[smile], outputs=[results, mol_pic])
    def do_subrings(smile):
        """Run ring-substitution analogue generation for the entered SMILES."""
        return sub_rings(smile)

    # Pressing Enter in the SMILES box behaves like the mask-filling button.
    # The handler has its own name so it does not rebind do_genmask, and its
    # parameters match the inputs it is wired to.
    @smile.submit(inputs=[smile, percent_mask], outputs=[results, mol_pic])
    def do_genmask_on_submit(smile, percent_mask):
        """Run mask-filling analogue generation when the SMILES is submitted."""
        return gen_mask(smile, percent_mask)


if __name__ == "__main__":
    gradio_app.launch(mcp_server=True)