Spaces:

cafierom
/

CafChem-GenMask

Sleeping

App Files Files Community

cafierom committed on Jun 26, 2025

Commit

51a0d60

verified ·

1 Parent(s): e29b036

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -7

app.py CHANGED Viewed

@@ -8,12 +8,31 @@ import random
 import deepchem
 from rdkit import Chem
 from rdkit.Chem import Draw
 # Model identifier passed to both the tokenizer loader and the fill-mask pipeline below.
 model_name = f"cafierom/bert-base-cased-ChemTok-ZN250K-V1"
 # Select the GPU when CUDA is available, otherwise fall back to the CPU.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Tokenizer for model_name; padding and truncation are passed as loader keyword arguments.
 tokenizer = AutoTokenizer.from_pretrained(model_name,padding = True, truncation = True)
 # "fill-mask" pipeline built from the same model identifier.
 mask_filler = pipeline("fill-mask", model_name)
 # Tokenize the "text" entry of a batch with the module-level tokenizer.
 # Padding and truncation are enabled, max_length is 250, and the special-tokens
 # mask is requested in the tokenizer output.
 def tokenize(batch):
   return tokenizer(batch["text"], padding=True, truncation=True, max_length=250, return_special_tokens_mask=True)
@@ -141,6 +160,41 @@ def calc_qed(smiles):
   qed = [Chem.QED.default(mol) for mol in mols]
   return qed,mols
 def gen_mask(smile_in: str, percent_mask: float) -> str:
   """
   Generate Analogues of a hit for hit expansion using generative mask-filling.
@@ -215,13 +269,116 @@ def gen_mask(smile_in: str, percent_mask: float) -> str:
     img = None
   return out_text,img
-gradio_app = gr.Interface(
-    gen_mask,
-    inputs=[gr.Textbox(label="SMILES for hit expansion"),gr.Radio(choices = [0.10, 0.15, 0.20],
-                            label="Fraction of hit molecule to mask.", value = 0.15,interactive=True)],
-    outputs=[gr.Textbox(label="New Molecules: "),gr.Image(label="Molecule Images:")],
-    title="Generate Analogues of a hit for hit expansion using generative mask-filling.",
-)
 # Launch the Gradio app only when this file is run as a script.
 # mcp_server=True is passed to launch(); presumably this also exposes the app
 # as an MCP server - confirm against the Gradio documentation.
 if __name__ == "__main__":
     gradio_app.launch(mcp_server=True)

 import deepchem
 from rdkit import Chem
 from rdkit.Chem import Draw
+import regex as re
 # Model identifier passed to both the tokenizer loader and the fill-mask pipeline below.
 model_name = f"cafierom/bert-base-cased-ChemTok-ZN250K-V1"
 # Select the GPU when CUDA is available, otherwise fall back to the CPU.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Tokenizer for model_name; padding and truncation are passed as loader keyword arguments.
 tokenizer = AutoTokenizer.from_pretrained(model_name,padding = True, truncation = True)
 # "fill-mask" pipeline built from the same model identifier.
 mask_filler = pipeline("fill-mask", model_name)
+# Regex patterns (raw strings, so "\(" is a literal parenthesis for the regex
+# engine and not an invalid Python escape) describing where on an aromatic ring
+# a substitution is attempted. sub_rings uses the first three entries for the
+# "cc" substitution point and the remaining five for every other point.
+sub_locations_re = [r"cc",                         #first unsubstituted carbons encountered
+                r"c[1-9]cc",                       #unsubstituted carbon 2 of ring
+                r"ccc[1-9]",                       #unsubstituted carbon 4 of ring
+                r"c[1-9]c(\([A-Z]+\))?c",          #carbon 2 of ring
+                r"c[1-9]cc(\([A-Z]+\))?c",         #carbon 3 of ring
+                r"c[1-9]ccc(\([A-Z]+\))?c",        #carbon 4 of ring
+                r"c[1-9]cccc(\([A-Z]+\))?c",       #carbon 5 of ring
+                r"c[1-9]ccccc(\([A-Z]+\))?"]       #carbon 6 of ring
+# Human-readable names, index-aligned with sub_locations_re (used in image legends).
+sub_location_names = ["any unsubbed carbon","unsubbed carbon at C2", "unsubbed carbon at C4",
+                      "substituent on C2","substituent on C3","substituent on C4","substituent on C5","substituent on C6"]
+# SMILES fragments that may be replaced by one of new_fragments.
+possible_sub_points = ["cc","c(O)c","c(OC)c"]
+# Replacement SMILES fragments and their display names (index-aligned).
+new_fragments = ["c(F)c","c(C#N)c","c(I)c","c([N+]([O-])=O)c","c(OC)c","c(Cl)c"]
+new_fragment_names = ["Fluoro","Cyano","Iodo","Nitro","Methoxy","Chloro"]
 # Tokenize the "text" entry of a batch with the module-level tokenizer.
 # Padding and truncation are enabled, max_length is 250, and the special-tokens
 # mask is requested in the tokenizer output.
 def tokenize(batch):
   return tokenizer(batch["text"], padding=True, truncation=True, max_length=250, return_special_tokens_mask=True)
   qed = [Chem.QED.default(mol) for mol in mols]
   return qed,mols
+def make_sub_string(match):
+  '''
+  Replacement callback: swaps the stored substitution point for the stored new
+  fragment inside the text of a regex match.
+    Args:
+      match: a regex match object
+    Returns:
+      the matched text with sub_point_stored replaced by new_fragment_stored,
+      or the matched text unchanged when it does not contain sub_point_stored
+      (in which case the global could_not_match counter is incremented)
+  '''
+  global could_not_match
+  matched_text = match.group()
+  if sub_point_stored not in matched_text:
+    # nothing to substitute in this match; count the miss and hand the text back
+    could_not_match += 1
+    return matched_text
+  return matched_text.replace(sub_point_stored, new_fragment_stored)
+def hold_values(sub_point,new_fragment):
+  '''
+  Record the substitution point and the new fragment in module-level globals
+  so that the make_sub_string function can read them.
+    Args:
+      sub_point: the fragment to be replaced
+      new_fragment: the fragment to put in its place
+  '''
+  global sub_point_stored, new_fragment_stored
+  sub_point_stored, new_fragment_stored = sub_point, new_fragment
 def gen_mask(smile_in: str, percent_mask: float) -> str:
   """
   Generate Analogues of a hit for hit expansion using generative mask-filling.
     img = None
   return out_text,img
+def sub_rings(smile_in: str, number_subs = 1) -> str:
+  '''
+  accepts a SMILES string and tries all possible substitutions indicated by the
+  possible_sub_points list and the new_fragments list. Specific cases of the
+  possible_sub_points list are found in the sub_locations_re list as regex. The
+  lists have corresponding name lists.
+    Args:
+      smile_in: a SMILES string
+      number_subs: the maximum number of regex matches to substitute per molecule
+    Returns:
+      out_text: a text string listing every generated SMILES with its QED value,
+                or "Invalid SMILES string" if anything fails
+      img: an image of the molecules with legends, or None on failure
+  '''
+  global could_not_match
+  could_not_match = 0
+  # A ring-numbered run of six aromatic carbons, each of the middle four
+  # optionally carrying a parenthesised upper-case substituent.
+  ring_pattern = r"c[1-9]c(\([A-Z]+\))?c(\([A-Z]+\))?c(\([A-Z]+\))?c(\([A-Z]+\))?c[1-9]"
+  try:
+    new_smiles = []
+    new_legends = []
+    # The ring search does not depend on the loop variables, so do it once.
+    ring = re.search(ring_pattern, smile_in)
+    ring_text = ring.group() if ring else ""
+    for sub_point in possible_sub_points:
+      # Only attempt substitution points that actually occur in the ring found.
+      if sub_point not in ring_text:
+        continue
+      # "cc" pairs with the first three location patterns, every other
+      # substitution point with the remaining ones.
+      if sub_point == "cc":
+        sub_locations = sub_locations_re[:3]
+        sub_names = sub_location_names[:3]
+      else:
+        sub_locations = sub_locations_re[3:]
+        sub_names = sub_location_names[3:]
+      for specific_frag, frag_name in zip(sub_locations, sub_names):
+        for new_fragment in new_fragments:
+          # make_sub_string reads the pair through module-level globals
+          hold_values(sub_point, new_fragment)
+          new_mol = re.sub(specific_frag, make_sub_string, smile_in, count=number_subs)
+          if new_mol != smile_in and new_mol not in new_smiles:
+            new_smiles.append(new_mol)
+            substituent = new_fragment.strip("c()")
+            new_legends.append(f"{frag_name} substitution with {substituent}.")
+    qeds,mols = calc_qed(new_smiles)
+    out_text = f"Total SMILES generated for hit: {len(new_smiles)}\n"
+    out_text += "===================================================\n"
+    for i, (smile, qed) in enumerate(zip(new_smiles, qeds), start=1):
+      out_text += f"analogue {i}: {smile} with QED: {qed:.3f}\n"
+    legends = [f"QED: {qed:.3f}\n"+legend for qed,legend in zip(qeds, new_legends)]
+    print(f"Could not match {could_not_match} requests.")
+    img = Draw.MolsToGridImage(mols, legends=legends, molsPerRow=3, subImgSize=(200,200),useSVG=False,returnPNG=False)
+  except Exception:
+    # Any failure (bad input, nothing to draw, ...) is reported to the user
+    # as an invalid SMILES string rather than crashing the app.
+    out_text = "Invalid SMILES string"
+    img = None
+  return out_text, img
+# Two-button UI: one button runs generative mask-filling (gen_mask), the other
+# runs ring substitution (sub_rings). Both write to the same text and image outputs.
+with gr.Blocks() as gradio_app:
+  gr.Markdown(
+      """
+      # Generate Analogues of a hit for hit expansion using generative mask-filling or
+      ring subsitutions.
+      - The hit molecule is input by the user; this molecule is then masked in different,
+        random ways. A model, cafierom/bert-base-cased-ChemTok-ZN250K-V1,
+        is used to generate SMILES strings for analogue molecules by unmasking the
+        hit molecule. All possibilities created by the generative mask-filling
+        are kept as long as the probability is greater than a cut-off, which is set
+        to 0.1 but which may be changed.
+      - The hit molecule may also be substituted with the groups in the new fragments list
+        on any phenyl ring at the points listed in sub location names list.
+      - The QED value, or quantitative estimate of druglikeness, a weighted average of
+        various ADME properties is also calculated. A value of 1.0 is perfect
+        drug-likeness, and a value of 0.0 is not drug-like. A value of about 0.5
+        is average for many drugs.
+      """)
+  smile = gr.Textbox(label="SMILES for hit expansion")
+  # gen_mask requires the fraction of the molecule to mask; offer the same
+  # choices and default that the previous gr.Interface version used.
+  percent_mask = gr.Radio(choices = [0.10, 0.15, 0.20],
+                          label="Fraction of hit molecule to mask.", value = 0.15,interactive=True)
+  with gr.Row():
+    # Distinct labels so the two actions can be told apart.
+    mask_btn = gr.Button("Generate analogues by mask-filling.")
+    sub_btn = gr.Button("Generate analogues by ring substitution.")
+  with gr.Row():
+    results = gr.Textbox(label="New Molecules: ")
+    mol_pic = gr.Image(label="Molecule Images:")
+  @mask_btn.click(inputs=[smile, percent_mask], outputs=[results, mol_pic])
+  def do_genmask(smile, percent_mask):
+    return gen_mask(smile, percent_mask)
+  @sub_btn.click(inputs=[smile], outputs=[results, mol_pic])
+  def do_subrings(smile):
+    return sub_rings(smile)
+  # Pressing Enter in the SMILES box behaves like the mask-filling button.
+  # The handler has its own name so it does not shadow do_genmask, and it
+  # takes exactly the inputs that are wired to it.
+  @smile.submit(inputs=[smile, percent_mask], outputs=[results, mol_pic])
+  def do_genmask_on_submit(smile, percent_mask):
+    return gen_mask(smile, percent_mask)
 # Launch the Gradio app only when this file is run as a script.
 # mcp_server=True is passed to launch(); presumably this also exposes the app
 # as an MCP server - confirm against the Gradio documentation.
 if __name__ == "__main__":
     gradio_app.launch(mcp_server=True)