|
|
import re |
|
|
|
|
|
import requests |
|
|
from rdkit import Chem, DataStructs |
|
|
from rdkit.Chem import AllChem |
|
|
|
|
|
from urllib.request import urlopen |
|
|
from urllib.parse import quote |
|
|
|
|
|
def is_smiles(text):
    """Return True if RDKit can parse *text* as a SMILES string, else False.

    Parsing is done with ``sanitize=False``, so RDKit's sanitization step is
    skipped and only the parse itself decides the result.

    Any ordinary exception raised while parsing (for example a non-string
    argument) is treated as "not a SMILES" and yields False.
    """
    try:
        mol = Chem.MolFromSmiles(text, sanitize=False)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return False
    return mol is not None
|
|
|
|
|
|
|
|
def is_multiple_smiles(text):
    """Return True if *text* passes ``is_smiles`` and contains a ``"."``.

    The dot is the separator between disconnected fragments in SMILES, so a
    parseable string containing one is treated as several molecules.
    """
    if not is_smiles(text):
        return False
    return "." in text
|
|
|
|
|
|
|
|
def split_smiles(text):
    """Split *text* on ``"."`` and return the resulting list of fragments."""
    fragments = text.split(".")
    return fragments
|
|
|
|
|
|
|
|
def is_cas(text):
    """Return True if *text* has the shape of a CAS registry number.

    The accepted shape is 2-7 digits, a hyphen, 2 digits, a hyphen and one
    final digit (e.g. ``"50-00-0"``). Only the format is checked; the check
    digit is not validated.
    """
    # fullmatch (rather than match + "$") rejects a trailing newline, and
    # re.ASCII restricts \d to 0-9 so other Unicode digits are not accepted.
    pattern = r"\d{2,7}-\d{2}-\d"
    return re.fullmatch(pattern, text, flags=re.ASCII) is not None
|
|
|
|
|
|
|
|
def largest_mol(smiles):
    """Return the longest dot-separated fragment of *smiles* accepted by ``is_smiles``.

    "Largest" is measured by string length. Fragments are tried from the
    longest downwards and the first one that ``is_smiles`` accepts is returned.

    Raises:
        IndexError: if every fragment is rejected by ``is_smiles``.
    """
    fragments = sorted(smiles.split("."), key=len)
    # Discard rejected candidates from the long end until a valid one is last.
    while not is_smiles(fragments[-1]):
        fragments.pop()
    return fragments[-1]
|
|
|
|
|
|
|
|
def canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for *smiles*.

    If parsing or writing raises any exception, the literal string
    ``"Invalid SMILES string"`` is returned instead of raising.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        canonical = Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        return "Invalid SMILES string"
    return canonical
|
|
|
|
|
|
|
|
def tanimoto(s1, s2):
    """Calculate the Tanimoto similarity of two SMILES strings.

    Both inputs are parsed with RDKit and converted to Morgan bit-vector
    fingerprints (radius 2, 2048 bits) before being compared.

    Returns the similarity value, or the string
    ``"Error: Not a valid SMILES string"`` when a TypeError, ValueError or
    AttributeError is raised along the way.
    """
    try:
        # Parse both inputs first, then fingerprint, keeping the original order.
        mols = [Chem.MolFromSmiles(smi) for smi in (s1, s2)]
        fingerprints = [
            AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
            for mol in mols
        ]
        return DataStructs.TanimotoSimilarity(fingerprints[0], fingerprints[1])
    except (TypeError, ValueError, AttributeError):
        return "Error: Not a valid SMILES string"
|
|
|
|
|
def CIRconvert(ids):
    """Resolve a chemical identifier to SMILES via cactus.nci.nih.gov.

    *ids* is URL-quoted and inserted into the service's
    ``/chemical/structure/<ids>/smiles`` path; the response body is returned
    decoded as UTF-8.

    Errors raised by ``urlopen`` (for example for an unknown identifier or a
    network failure) are not caught here and propagate to the caller.
    """
    url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
    # Use the response as a context manager so the connection is always closed.
    with urlopen(url) as response:
        return response.read().decode('utf8')
|
|
|
|
|
|
|
|
|
|
|
def pubchem_query2smiles(
    query: str,
    url: str = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}",
) -> str:
    """Turn a molecule name into a SMILES string, passing SMILES input through.

    Args:
        query: A molecule name, or a SMILES string.
        url: Format string for the PubChem request; it receives the query and
            the property path. ``None`` selects the default PubChem URL.

    Returns:
        * *query* unchanged when it already passes ``is_smiles`` and is a
          single molecule;
        * otherwise the canonical SMILES of the largest fragment found by
          PubChem, or by ``CIRconvert`` when PubChem has no usable entry;
        * a human-readable "Could not find..." message when both lookups fail.

    Raises:
        ValueError: if *query* is a SMILES string describing several molecules.
    """
    if is_smiles(query):
        if is_multiple_smiles(query):
            raise ValueError(
                "Multiple SMILES strings detected, input one molecule at a time."
            )
        return query

    if url is None:
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}"
    r = requests.get(url.format(query, "property/IsomericSMILES/JSON"))
    data = r.json()

    try:
        smi = data["PropertyTable"]["Properties"][0]["IsomericSMILES"]
    except (KeyError, IndexError, TypeError):
        # PubChem's answer did not have the expected shape: fall back to the
        # CIR resolver.
        try:
            smi = CIRconvert(query)
        except (OSError, KeyError):
            # urllib's HTTPError / URLError are OSError subclasses; catching
            # only KeyError here made this message unreachable.
            return "Could not find a molecule matching the text. One possible cause is that the input is incorrect, input one molecule at a time."
    return str(Chem.CanonSmiles(largest_mol(smi)))
|
|
|
|
|
|
|
|
def _sections_with_heading(parent, heading):
    """Yield the entries of ``parent["Section"]`` whose TOCHeading equals *heading*."""
    for child in parent["Section"]:
        if child.get("TOCHeading") == heading:
            yield child


def query2cas(query: str, url_cid: str, url_data: str):
    """Look up the CAS number of a molecule through PubChem.

    Args:
        query: A molecule name or a single-molecule SMILES string.
        url_cid: Format string for the CID lookup; it receives the lookup mode
            (``"name"`` or ``"smiles"``) and the query.
        url_data: Format string for the record lookup; it receives the CID.

    Returns:
        The first CAS string found under
        "Names and Identifiers" > "Other Identifiers" > "CAS" in the record.

    Raises:
        ValueError: if *query* holds several SMILES, if PubChem has no usable
            entry for it, or if the record contains no CAS number.
    """
    try:
        mode = "name"
        identifier = query
        if is_smiles(query):
            if is_multiple_smiles(query):
                raise ValueError(
                    "Multiple SMILES strings detected, input one molecule at a time."
                )
            mode = "smiles"
            # SMILES routinely contain URL-significant characters ('#', '%',
            # '/', '+'); unencoded they truncate or corrupt the request path.
            identifier = quote(query, safe="")
        cid_url = url_cid.format(mode, identifier)
        cid = requests.get(cid_url).json()["IdentifierList"]["CID"][0]
        record_url = url_data.format(cid)
        data = requests.get(record_url).json()
    except (requests.exceptions.RequestException, KeyError, IndexError) as err:
        raise ValueError("Invalid molecule input, no Pubchem entry") from err

    try:
        for names in _sections_with_heading(data["Record"], "Names and Identifiers"):
            for identifiers in _sections_with_heading(names, "Other Identifiers"):
                for cas in _sections_with_heading(identifiers, "CAS"):
                    markup = cas["Information"][0]["Value"]["StringWithMarkup"]
                    return markup[0]["String"]
    except (KeyError, IndexError) as err:
        raise ValueError("Invalid molecule input, no Pubchem entry") from err

    raise ValueError("CAS number not found")
|
|
|
|
|
|
|
|
def smiles2name(smi, single_name=True):
    """Query PubChem for the synonyms of a molecule given as SMILES.

    Args:
        smi: SMILES string of the molecule; it is canonicalised with RDKit
            before the request is made.
        single_name: When True, return the first synonym that does not look
            like a CAS number (per ``is_cas``). When False, return the whole
            synonym list as PubChem delivered it.

    Returns:
        A single name, or the list of synonyms.

    Raises:
        ValueError: ``"Invalid SMILES string"`` if RDKit cannot process *smi*,
            ``"Unknown Molecule"`` if the response has no synonym entry, and
            ``"No name found"`` if every synonym looks like a CAS number.
    """
    try:
        smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True)
    except Exception as err:
        raise ValueError("Invalid SMILES string") from err

    r = requests.get(
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
        # Percent-encode: characters such as '#' or '/' in a SMILES would
        # otherwise be interpreted as URL syntax and corrupt the request.
        + quote(smi, safe="")
        + "/synonyms/JSON"
    )
    data = r.json()

    try:
        names = data["InformationList"]["Information"][0]["Synonym"]
    except (KeyError, IndexError) as err:
        raise ValueError("Unknown Molecule") from err

    if not single_name:
        return names

    for candidate in names:
        if not is_cas(candidate):
            return candidate
    # Also reached for an empty synonym list, which used to raise IndexError.
    raise ValueError("No name found")
|
|
|