| import re |
|
|
| import requests |
| from rdkit import Chem, DataStructs |
| from rdkit.Chem import AllChem |
|
|
| from urllib.request import urlopen |
| from urllib.parse import quote |
|
|
def is_smiles(text):
    """Return True if RDKit can parse *text* as a SMILES string.

    Parsing is done with ``sanitize=False``, so only the fact that
    ``Chem.MolFromSmiles`` produces a molecule object is checked; the
    sanitization step is skipped.

    Args:
        text: Candidate SMILES string.

    Returns:
        bool: True when ``Chem.MolFromSmiles`` returns a molecule, False when
        it returns ``None`` or raises an ordinary exception.
    """
    try:
        return Chem.MolFromSmiles(text, sanitize=False) is not None
    except Exception:
        # Deliberately not a bare ``except:`` -- that would also swallow
        # KeyboardInterrupt and SystemExit. Any ordinary error raised while
        # parsing simply means "not a SMILES".
        return False
|
|
|
|
def is_multiple_smiles(text):
    """Return True if *text* is a parseable SMILES containing a "." separator.

    The SMILES check runs first, so input that ``is_smiles`` rejects yields
    False without the "." membership test being evaluated.
    """
    return is_smiles(text) and "." in text
|
|
|
|
def split_smiles(text):
    """Split *text* on every "." and return the list of pieces.

    No validation is performed; empty pieces are kept as empty strings.
    """
    fragments = text.split(".")
    return fragments
|
|
|
|
def is_cas(text):
    """Return True if *text* has the shape of a CAS number.

    The accepted shape is 2-7 digits, a hyphen, 2 digits, a hyphen and one
    final digit. Only the format is checked; the trailing digit is not
    verified against the preceding ones.

    Args:
        text: Candidate string. Non-string input yields False.

    Returns:
        bool: True when the whole string matches the pattern.
    """
    if not isinstance(text, str):
        return False
    pattern = r"\d{2,7}-\d{2}-\d"
    # ``fullmatch`` is used instead of ``match`` with ``^...$`` because ``$``
    # also matches just before a trailing newline, so "50-00-0\n" was accepted.
    # ``re.ASCII`` restricts ``\d`` to 0-9 rather than any Unicode digit.
    return re.fullmatch(pattern, text, flags=re.ASCII) is not None
|
|
|
|
def largest_mol(smiles):
    """Return the longest "."-separated fragment of *smiles* that parses.

    "Largest" is measured by the character length of the fragment string, not
    by atom count. Fragments are tried from longest to shortest; among
    fragments of equal length the one appearing last in *smiles* is tried
    first.

    Args:
        smiles: SMILES string, possibly containing several fragments
            separated by ".".

    Returns:
        str: The first fragment, in the order described above, that
        ``is_smiles`` accepts.

    Raises:
        IndexError: If no fragment is accepted by ``is_smiles``. The exception
            type is kept from the previous implementation, which failed with
            a bare "list index out of range" in this situation.
    """
    # sorted() is stable, so reversing it preserves the original tie-breaking.
    for fragment in reversed(sorted(smiles.split("."), key=len)):
        if is_smiles(fragment):
            return fragment
    raise IndexError(f"No valid SMILES fragment found in {smiles!r}")
|
|
|
|
def canonical_smiles(smiles):
    """Return the RDKit canonical form of *smiles*.

    The input is parsed with ``Chem.MolFromSmiles`` and written back with
    ``Chem.MolToSmiles(..., canonical=True)``.

    Returns:
        str: The canonical SMILES, or the literal message
        "Invalid SMILES string" if either step raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        canonical = Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        return "Invalid SMILES string"
    return canonical
|
|
|
|
def tanimoto(s1, s2):
    """Compute the Tanimoto similarity between two SMILES strings.

    Both inputs are parsed with RDKit and converted to Morgan bit-vector
    fingerprints (radius 2, 2048 bits) before being compared.

    Returns:
        The value returned by ``DataStructs.TanimotoSimilarity``, or the
        literal message "Error: Not a valid SMILES string" when a TypeError,
        ValueError or AttributeError is raised along the way.
    """
    try:
        # Parse both inputs before fingerprinting either one.
        molecules = [Chem.MolFromSmiles(smi) for smi in (s1, s2)]
        fingerprints = [
            AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
            for mol in molecules
        ]
        similarity = DataStructs.TanimotoSimilarity(*fingerprints)
    except (TypeError, ValueError, AttributeError):
        return "Error: Not a valid SMILES string"
    return similarity
|
|
def CIRconvert(ids):
    """Look up the SMILES for an identifier via the NCI CACTUS resolver.

    Args:
        ids: Identifier string. It is percent-encoded with
            ``urllib.parse.quote`` before being placed in the URL path.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        Any exception raised by ``urlopen`` (for example
        ``urllib.error.HTTPError`` / ``URLError``) is propagated unchanged.
    """
    url = "http://cactus.nci.nih.gov/chemical/structure/" + quote(ids) + "/smiles"
    # The context manager closes the HTTP response even if read/decode fails;
    # previously the response object was never closed.
    with urlopen(url) as response:
        return response.read().decode("utf8")
| |
| |
| |
def pubchem_query2smiles(
    query: str,
    url: str = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}",
) -> str:
    """Resolve a molecule query to a canonical SMILES string.

    If *query* already parses as a single SMILES it is returned unchanged.
    Otherwise PubChem is queried by name; if the response does not contain
    the expected ``IsomericSMILES`` property, the CACTUS resolver
    (``CIRconvert``) is tried as a fallback. The longest valid fragment of
    the result is canonicalised with ``Chem.CanonSmiles``.

    Args:
        query: A SMILES string or a molecule name.
        url: URL template with two ``{}`` placeholders (query, then the
            property path). ``None`` selects the default template.

    Returns:
        str: The SMILES string, or a human-readable "Could not find..."
        message when neither service yields a result.

    Raises:
        ValueError: If *query* is a SMILES containing several molecules.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.error import URLError

    if is_smiles(query):
        if is_multiple_smiles(query):
            raise ValueError(
                "Multiple SMILES strings detected, input one molecule at a time."
            )
        return query

    if url is None:
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}"
    # Percent-encode the query so characters such as "#" or "?" are sent as
    # part of the name instead of being read as URL syntax.
    response = requests.get(
        url.format(quote(query), "property/IsomericSMILES/JSON")
    )
    data = response.json()

    try:
        smi = data["PropertyTable"]["Properties"][0]["IsomericSMILES"]
    except (KeyError, IndexError, TypeError):
        # PubChem gave no usable record; fall back to the CACTUS resolver.
        try:
            smi = CIRconvert(query)
        except (KeyError, URLError):
            # urlopen reports failures as HTTPError/URLError (HTTPError is a
            # URLError subclass); catching only KeyError never reached this
            # message.
            return "Could not find a molecule matching the text. One possible cause is that the input is incorrect, input one molecule at a time."
    return str(Chem.CanonSmiles(largest_mol(smi)))
|
|
|
|
def query2cas(query: str, url_cid: str, url_data: str):
    """Look up the CAS number of a molecule through PubChem.

    Args:
        query: A molecule name or a single-molecule SMILES string.
        url_cid: URL template with two ``{}`` placeholders, filled with the
            lookup mode ("name" or "smiles") and the percent-encoded query.
            The JSON response is expected to contain
            ``["IdentifierList"]["CID"]``.
        url_data: URL template with one ``{}`` placeholder, filled with the
            first CID found. The JSON response is expected to contain
            ``["Record"]["Section"]``.

    Returns:
        str: The first CAS string found under
        "Names and Identifiers" -> "Other Identifiers" -> "CAS".

    Raises:
        ValueError: If *query* holds several SMILES, if the lookup fails or
            the response is malformed ("Invalid molecule input, no Pubchem
            entry"), or if the record has no CAS section ("CAS number not
            found").
    """
    mode = "name"
    if is_smiles(query):
        if is_multiple_smiles(query):
            raise ValueError(
                "Multiple SMILES strings detected, input one molecule at a time."
            )
        mode = "smiles"

    try:
        # Percent-encode the query: an unescaped "#" (SMILES triple bond)
        # would otherwise be treated as the start of a URL fragment.
        url_cid = url_cid.format(mode, quote(query))
        cid = requests.get(url_cid).json()["IdentifierList"]["CID"][0]
        url_data = url_data.format(cid)
        data = requests.get(url_data).json()
    except (
        requests.exceptions.RequestException,
        KeyError,
        IndexError,
        TypeError,
        ValueError,  # JSON decode errors on requests versions predating its own JSONDecodeError
    ) as err:
        raise ValueError("Invalid molecule input, no Pubchem entry") from err

    try:
        for section in data["Record"]["Section"]:
            if section.get("TOCHeading") != "Names and Identifiers":
                continue
            for subsection in section["Section"]:
                if subsection.get("TOCHeading") != "Other Identifiers":
                    continue
                for entry in subsection["Section"]:
                    if entry.get("TOCHeading") == "CAS":
                        value = entry["Information"][0]["Value"]
                        return value["StringWithMarkup"][0]["String"]
    except (KeyError, IndexError, TypeError) as err:
        # Empty lists or unexpected shapes previously escaped as raw
        # IndexError/TypeError; report them like any other malformed record.
        raise ValueError("Invalid molecule input, no Pubchem entry") from err

    raise ValueError("CAS number not found")
|
|
|
|
def smiles2name(smi, single_name=True):
    """Query PubChem for the synonyms of a molecule given as SMILES.

    Args:
        smi: SMILES string; it is canonicalised with RDKit before the query.
        single_name: When True, return the first synonym that does not look
            like a CAS number. When False, return the whole synonym list.

    Returns:
        str | list: A single name, or the list of synonyms.

    Raises:
        ValueError: "Invalid SMILES string" if RDKit cannot canonicalise the
            input, "Unknown Molecule" if the response holds no synonym
            record, "No name found" if every synonym looks like a CAS number
            (or the synonym list is empty).
    """
    try:
        smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True)
    except Exception as err:
        raise ValueError("Invalid SMILES string") from err

    # Percent-encode the SMILES so characters such as "#" are not read as URL
    # syntax. quote() leaves "/" unescaped by default, so SMILES containing
    # "/" are still sent with a literal slash in the path, as before.
    response = requests.get(
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
        + quote(smi)
        + "/synonyms/JSON"
    )
    data = response.json()

    try:
        names = data["InformationList"]["Information"][0]["Synonym"]
    except (KeyError, IndexError, TypeError) as err:
        raise ValueError("Unknown Molecule") from err

    if not single_name:
        return names

    for name in names:
        if not is_cas(name):
            return name
    raise ValueError("No name found")
|
|