"""Helpers for validating SMILES/CAS strings and resolving molecules online.

Lookups go to the PubChem PUG REST API and, as a fallback for name queries,
to the structure service at cactus.nci.nih.gov.
"""

import re
from typing import List, Optional, Union
from urllib.error import URLError
from urllib.parse import quote
from urllib.request import urlopen

import requests
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

# CAS registry number shape: 2-7 digits, 2 digits, 1 check digit.
_CAS_PATTERN = re.compile(r"^\d{2,7}-\d{2}-\d$")

# Default PubChem endpoint; the two slots are (query, requested operation).
_PUBCHEM_NAME_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}"

# Seconds to wait on any remote service before giving up instead of hanging.
_REQUEST_TIMEOUT = 30


def is_smiles(text) -> bool:
    """Return True if RDKit can parse *text* as SMILES.

    Parsing is done with ``sanitize=False``, so only the syntax is checked.
    Any exception raised by the parser is treated as "not a SMILES".
    """
    try:
        return Chem.MolFromSmiles(text, sanitize=False) is not None
    except Exception:
        return False


def is_multiple_smiles(text) -> bool:
    """Return True if *text* is parseable SMILES containing a ``.`` separator."""
    return is_smiles(text) and "." in text


def split_smiles(text: str) -> List[str]:
    """Split a dot-separated SMILES string into its fragments."""
    return text.split(".")


def is_cas(text: str) -> bool:
    """Return True if *text* has the shape of a CAS number (e.g. ``64-17-5``).

    Only the digit grouping is checked; the check digit is not verified.
    """
    return _CAS_PATTERN.match(text) is not None


def largest_mol(smiles: str) -> str:
    """Return the longest dot-separated fragment of *smiles* that parses.

    "Largest" is measured by string length, not by atom count or mass.
    Among fragments of equal length, the one appearing last wins.

    Raises:
        ValueError: if no fragment is parseable SMILES.
    """
    # sorted() is stable, so walking it in reverse tries the longest fragment
    # first and, among ties, the one that came last in the input.
    for fragment in reversed(sorted(smiles.split("."), key=len)):
        if is_smiles(fragment):
            return fragment
    raise ValueError("No valid SMILES fragment found")


def canonical_smiles(smiles):
    """Return the canonical form of *smiles*.

    Returns the string ``"Invalid SMILES string"`` (rather than raising) when
    the input cannot be converted.
    """
    try:
        return Chem.MolToSmiles(Chem.MolFromSmiles(smiles), canonical=True)
    except Exception:
        return "Invalid SMILES string"


def tanimoto(s1, s2) -> Union[float, str]:
    """Calculate the Tanimoto similarity of two SMILES strings.

    Uses Morgan fingerprints (radius 2, 2048 bits). Returns the string
    ``"Error: Not a valid SMILES string"`` when either input is unusable.
    """
    error = "Error: Not a valid SMILES string"
    try:
        mol1 = Chem.MolFromSmiles(s1)
        mol2 = Chem.MolFromSmiles(s2)
        # MolFromSmiles signals a parse failure by returning None.
        if mol1 is None or mol2 is None:
            return error
        fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2, nBits=2048)
        fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2, nBits=2048)
        return DataStructs.TanimotoSimilarity(fp1, fp2)
    except (TypeError, ValueError, AttributeError):
        return error


def CIRconvert(ids):
    """Resolve an identifier to SMILES via cactus.nci.nih.gov.

    Returns the decoded response body unchanged. Network and HTTP errors
    (``urllib.error.URLError`` and subclasses) propagate to the caller.
    """
    url = "http://cactus.nci.nih.gov/chemical/structure/" + quote(ids) + "/smiles"
    # The context manager closes the connection even if read() fails.
    with urlopen(url, timeout=_REQUEST_TIMEOUT) as response:
        return response.read().decode("utf8")


def pubchem_query2smiles(
    query: str,
    url: Optional[str] = _PUBCHEM_NAME_URL,
) -> str:
    """Resolve a molecule name (or a single SMILES) to a SMILES string.

    A query that already parses as a single SMILES is returned unchanged.
    Otherwise PubChem is queried by name; if that gives no usable result the
    CIR service is tried. The longest valid fragment of the result is
    returned in canonical form.

    Args:
        query: molecule name or SMILES.
        url: format string with two ``{}`` slots (query, operation);
            ``None`` selects the default PubChem endpoint.

    Returns:
        The SMILES string, or an explanatory message if neither service
        could resolve the query.

    Raises:
        ValueError: if *query* is a SMILES with several dot-separated parts.
    """
    if is_smiles(query):
        if is_multiple_smiles(query):
            raise ValueError(
                "Multiple SMILES strings detected, input one molecule at a time."
            )
        return query

    if url is None:
        url = _PUBCHEM_NAME_URL
    # Quote the query so characters such as '#' or '?' stay in the URL path.
    response = requests.get(
        url.format(quote(query), "property/IsomericSMILES/JSON"),
        timeout=_REQUEST_TIMEOUT,
    )
    try:
        smi = response.json()["PropertyTable"]["Properties"][0]["IsomericSMILES"]
    except (KeyError, IndexError, TypeError, ValueError):
        # No usable PubChem answer (fault document, empty list, non-JSON
        # body): fall back to the CIR service.
        try:
            smi = CIRconvert(query)
        except URLError:
            # Includes HTTPError, which urlopen raises for non-2xx replies.
            return "Could not find a molecule matching the text. One possible cause is that the input is incorrect, input one molecule at a time."
    return str(Chem.CanonSmiles(largest_mol(smi)))


def _subsections(node, heading):
    """Yield the entries of ``node["Section"]`` whose TOCHeading is *heading*."""
    for child in node["Section"]:
        if child.get("TOCHeading") == heading:
            yield child


def query2cas(query: str, url_cid: str, url_data: str) -> str:
    """Look up the CAS number of a molecule given by name or SMILES.

    Args:
        query: molecule name or a single SMILES string.
        url_cid: format string with two ``{}`` slots (mode, query) for the
            request that returns the compound id.
        url_data: format string with one ``{}`` slot (compound id) for the
            request that returns the full record.

    Returns:
        The first CAS string found in the record.

    Raises:
        ValueError: for multi-fragment SMILES, when the lookup fails or the
            record has an unexpected shape, or when no CAS entry exists.
    """
    mode = "name"
    if is_smiles(query):
        # Checked outside the try block so this message is never replaced by
        # the generic lookup error.
        if is_multiple_smiles(query):
            raise ValueError(
                "Multiple SMILES strings detected, input one molecule at a time."
            )
        mode = "smiles"

    try:
        # Quote the query: an unescaped '#' (triple bond) would otherwise be
        # treated as a URL fragment and silently truncate the SMILES.
        cid_url = url_cid.format(mode, quote(query))
        cid_reply = requests.get(cid_url, timeout=_REQUEST_TIMEOUT).json()
        cid = cid_reply["IdentifierList"]["CID"][0]
        data = requests.get(url_data.format(cid), timeout=_REQUEST_TIMEOUT).json()
    except (
        requests.exceptions.RequestException,
        KeyError,
        IndexError,
        TypeError,
        ValueError,
    ) as err:
        raise ValueError("Invalid molecule input, no Pubchem entry") from err

    try:
        for names in _subsections(data["Record"], "Names and Identifiers"):
            for identifiers in _subsections(names, "Other Identifiers"):
                for cas in _subsections(identifiers, "CAS"):
                    info = cas["Information"][0]
                    return info["Value"]["StringWithMarkup"][0]["String"]
    except (KeyError, IndexError, TypeError) as err:
        raise ValueError("Invalid molecule input, no Pubchem entry") from err

    raise ValueError("CAS number not found")


def smiles2name(smi, single_name=True):
    """Query PubChem for the synonyms of the molecule given as SMILES.

    Args:
        smi: SMILES string; it is canonicalised before the query.
        single_name: if True, return the first synonym that is not a CAS
            number; if False, return the whole synonym list.

    Raises:
        ValueError: if the SMILES is invalid, the molecule is unknown to
            PubChem, or every synonym is a CAS number.
    """
    try:
        smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True)
    except Exception as err:
        raise ValueError("Invalid SMILES string") from err

    # Query the PubChem database; quote the SMILES so '#' and similar
    # characters are not interpreted as URL syntax.
    response = requests.get(
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
        + quote(smi)
        + "/synonyms/JSON",
        timeout=_REQUEST_TIMEOUT,
    )
    try:
        names = response.json()["InformationList"]["Information"][0]["Synonym"]
    except (KeyError, IndexError, TypeError, ValueError) as err:
        raise ValueError("Unknown Molecule") from err

    if not single_name:
        return names

    # First synonym that is an actual name rather than a CAS number.
    name = next((candidate for candidate in names if not is_cas(candidate)), None)
    if name is None:
        raise ValueError("No name found")
    return name