# TeLLAgent / utils.py
# Uploaded by jinysun ("Upload 9 files", commit dbaa85f, verified).
import re
import requests
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from urllib.request import urlopen
from urllib.parse import quote
def is_smiles(text):
    """Report whether ``text`` can be parsed by RDKit as a SMILES string.

    The parse is done with ``sanitize=False``, i.e. RDKit's sanitization
    step is not run on the parsed molecule.

    Args:
        text: Candidate SMILES string.

    Returns:
        True if ``Chem.MolFromSmiles`` returns a molecule; False if it
        returns None or raises.
    """
    try:
        mol = Chem.MolFromSmiles(text, sanitize=False)
    except Exception:
        # Any parser failure means "not SMILES". ``Exception`` rather than a
        # bare ``except`` so KeyboardInterrupt / SystemExit still propagate.
        return False
    return mol is not None
def is_multiple_smiles(text):
    """Tell whether ``text`` is valid SMILES containing a '.' separator.

    Returns False for anything ``is_smiles`` rejects.
    """
    if not is_smiles(text):
        return False
    # A '.' in a SMILES string separates disconnected fragments.
    return "." in text
def split_smiles(text):
    """Split ``text`` on '.' and return the list of pieces.

    No validation is performed; empty pieces are kept as empty strings.
    """
    fragments = text.split(".")
    return fragments
def is_cas(text):
    """Check whether ``text`` has the shape of a CAS registry number.

    The accepted shape is 2-7 digits, a hyphen, 2 digits, a hyphen and one
    final digit (for example ``"7732-18-5"``). Only the format is checked;
    the trailing check digit is not validated.

    Args:
        text: Candidate string. Non-string input yields False.

    Returns:
        True when the whole string matches the CAS format, else False.
    """
    if not isinstance(text, str):
        return False
    # fullmatch: ``$`` with re.match would also accept a trailing newline.
    # re.ASCII: restrict ``\d`` to 0-9 instead of every Unicode digit.
    return re.fullmatch(r"\d{2,7}-\d{2}-\d", text, flags=re.ASCII) is not None
def largest_mol(smiles):
    """Return the longest '.'-separated fragment of ``smiles`` that parses.

    "Largest" is measured by string length, not atom count. Fragments are
    tried from longest to shortest; among fragments of equal length the one
    that appears later in the input is tried first.

    Args:
        smiles: SMILES string, possibly with several '.'-separated fragments.

    Returns:
        The first fragment (in the order above) accepted by ``is_smiles``.

    Raises:
        IndexError: If no fragment is accepted by ``is_smiles``.
    """
    # sorted() is stable, so walking the ascending sort backwards visits the
    # longest fragments first and, on ties, the later fragment first.
    by_length = sorted(smiles.split("."), key=len)
    for fragment in reversed(by_length):
        if is_smiles(fragment):
            return fragment
    # Same exception type the exhausted list used to produce, but with a
    # message that explains what went wrong.
    raise IndexError("no valid SMILES fragment found in input")
def canonical_smiles(smiles):
    """Convert ``smiles`` to the canonical SMILES produced by RDKit.

    Returns:
        The canonical SMILES string, or the literal text
        "Invalid SMILES string" if RDKit raises while parsing or writing.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        return "Invalid SMILES string"
def tanimoto(s1, s2):
    """Compute the Tanimoto similarity between two SMILES strings.

    Both inputs are parsed, turned into Morgan bit-vector fingerprints
    (radius 2, 2048 bits) and compared.

    Returns:
        The similarity value, or the text "Error: Not a valid SMILES string"
        when a TypeError, ValueError or AttributeError is raised on the way.
    """
    radius = 2
    n_bits = 2048
    try:
        # Parse both inputs first, then fingerprint both.
        mols = [Chem.MolFromSmiles(smi) for smi in (s1, s2)]
        fingerprints = [
            AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            for mol in mols
        ]
        return DataStructs.TanimotoSimilarity(fingerprints[0], fingerprints[1])
    except (TypeError, ValueError, AttributeError):
        return "Error: Not a valid SMILES string"
def CIRconvert(ids):
    """Fetch the SMILES for an identifier from the cactus.nci.nih.gov service.

    Args:
        ids: Chemical identifier (e.g. a name); it is percent-encoded before
            being placed in the request URL.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for a
            non-success status is a subclass).
    """
    url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
    # Context manager so the HTTP response is closed even if read/decode fails.
    with urlopen(url) as response:
        return response.read().decode('utf8')
def pubchem_query2smiles(
    query: str,
    url: str = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}",
) -> str:
    """Resolve a molecule name (or a single SMILES) to a SMILES string.

    If ``query`` already parses as SMILES it is returned unchanged. Otherwise
    PubChem is asked for the compound by name; if that gives no usable
    answer, ``CIRconvert`` is tried. The result is reduced to its longest
    valid fragment and canonicalised with RDKit.

    Args:
        query: Molecule name or SMILES string.
        url: URL template with two ``{}`` slots (query, then operation).
            ``None`` selects the default PubChem name endpoint.

    Returns:
        The SMILES string, or an explanatory message string when neither
        service can resolve ``query``.

    Raises:
        ValueError: If ``query`` is a SMILES with several '.'-separated parts.
    """
    if is_smiles(query):
        if is_multiple_smiles(query):
            raise ValueError(
                "Multiple SMILES strings detected, input one molecule at a time."
            )
        return query
    if url is None:
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}"
    # Percent-encode the name so characters such as '#' or '?' cannot
    # truncate or alter the request path.
    response = requests.get(
        url.format(quote(query), "property/IsomericSMILES/JSON")
    )
    try:
        properties = response.json()["PropertyTable"]["Properties"][0]
        # NOTE(review): newer PubChem responses reportedly return this value
        # under "SMILES" -- confirm; accepting both keys is harmless.
        if "IsomericSMILES" in properties:
            smi = properties["IsomericSMILES"]
        else:
            smi = properties["SMILES"]
    except (KeyError, IndexError, TypeError, ValueError):
        # No usable PubChem answer (missing keys, empty list or non-JSON
        # body): fall back to the second resolver.
        try:
            smi = CIRconvert(query)
        except (KeyError, OSError, ValueError):
            # urllib's URLError/HTTPError are OSError subclasses; decode
            # failures are ValueError subclasses.
            return (
                "Could not find a molecule matching the text. One possible "
                "cause is that the input is incorrect, input one molecule "
                "at a time."
            )
    return str(Chem.CanonSmiles(largest_mol(smi)))
def _sections_with_heading(sections, heading):
    """Yield each section in ``sections`` whose "TOCHeading" equals ``heading``."""
    for section in sections:
        if section.get("TOCHeading") == heading:
            yield section


def query2cas(query: str, url_cid: str, url_data: str):
    """Look up the CAS number of a molecule through PubChem.

    Args:
        query: Molecule name or a single SMILES string.
        url_cid: URL template with two ``{}`` slots, filled with the lookup
            mode (``"name"`` or ``"smiles"``) and the percent-encoded query.
        url_data: URL template with one ``{}`` slot, filled with the CID
            taken from the first response.

    Returns:
        The string stored under Names and Identifiers > Other Identifiers >
        CAS in the record returned by ``url_data``.

    Raises:
        ValueError: If ``query`` is a multi-fragment SMILES, if either
            request fails or returns an unexpected structure, or if the
            record has no CAS entry.
    """
    no_entry = "Invalid molecule input, no Pubchem entry"

    # Validate before the network try-block so this error is never masked.
    mode = "name"
    if is_smiles(query):
        if is_multiple_smiles(query):
            raise ValueError(
                "Multiple SMILES strings detected, input one molecule at a time."
            )
        mode = "smiles"

    try:
        # Percent-encode the query: an unescaped '#' (triple bond) would be
        # treated as a URL fragment and silently cut the SMILES short.
        cid_reply = requests.get(url_cid.format(mode, quote(query))).json()
        cid = cid_reply["IdentifierList"]["CID"][0]
        data = requests.get(url_data.format(cid)).json()
    except (
        requests.exceptions.RequestException,
        KeyError,
        IndexError,
        TypeError,
        ValueError,
    ) as err:
        raise ValueError(no_entry) from err

    try:
        for names in _sections_with_heading(
            data["Record"]["Section"], "Names and Identifiers"
        ):
            for others in _sections_with_heading(
                names["Section"], "Other Identifiers"
            ):
                for cas in _sections_with_heading(others["Section"], "CAS"):
                    value = cas["Information"][0]["Value"]
                    return value["StringWithMarkup"][0]["String"]
    except (KeyError, IndexError, TypeError, AttributeError) as err:
        # The record does not have the expected nesting.
        raise ValueError(no_entry) from err

    raise ValueError("CAS number not found")
def smiles2name(smi, single_name=True):
    """Query PubChem for the synonyms of a molecule given as SMILES.

    Args:
        smi: SMILES string; it is canonicalised with RDKit before the query.
        single_name: When True, return the first synonym that does not look
            like a CAS number. When False, return the whole synonym list.

    Returns:
        A single name, or the list of synonyms when ``single_name`` is False.

    Raises:
        ValueError: "Invalid SMILES string" if RDKit cannot process ``smi``;
            "Unknown Molecule" if the response lacks a synonym list;
            "No name found" if every synonym looks like a CAS number (or the
            list is empty).
    """
    try:
        smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True)
    except Exception as err:
        raise ValueError("Invalid SMILES string") from err

    # Percent-encode the SMILES so characters such as '#' or '+' survive in
    # the URL path ('/' is left as-is, exactly as before).
    response = requests.get(
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
        + quote(smi)
        + "/synonyms/JSON"
    )

    try:
        names = response.json()["InformationList"]["Information"][0]["Synonym"]
    except (KeyError, IndexError, TypeError, ValueError) as err:
        # Missing keys, an empty Information list or a non-JSON body.
        raise ValueError("Unknown Molecule") from err

    if not single_name:
        return names

    # Skip CAS-number synonyms and return the first real name.
    for name in names:
        if not is_cas(name):
            return name
    raise ValueError("No name found")