File size: 5,040 Bytes
dbaa85f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import re
import requests
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from urllib.request import urlopen
from urllib.parse import quote
def is_smiles(text):
    """Return True if RDKit can parse *text* as a SMILES string.

    Parsing is done with ``sanitize=False``, so RDKit's sanitization step is
    skipped and only the ability to build a molecule from the text is checked.

    Args:
        text: Candidate SMILES string.

    Returns:
        bool: True when ``Chem.MolFromSmiles`` returns a molecule; False when
        it returns None or raises.
    """
    try:
        return Chem.MolFromSmiles(text, sanitize=False) is not None
    except Exception:
        # Any parser failure means "not a SMILES". Unlike a bare ``except``,
        # this lets KeyboardInterrupt / SystemExit propagate.
        return False
def is_multiple_smiles(text):
    """Tell whether *text* is a SMILES string with more than one fragment.

    The text qualifies when it passes ``is_smiles`` and contains the ``.``
    character that separates fragments.

    Args:
        text: Candidate SMILES string.

    Returns:
        bool: True for a parseable SMILES containing ``.``, otherwise False.
    """
    return is_smiles(text) and "." in text
def split_smiles(text):
    """Split a dot-separated SMILES string into its fragments.

    Args:
        text: SMILES string, possibly containing ``.`` separators.

    Returns:
        list: The pieces of *text* between ``.`` characters, in order.
    """
    fragments = text.split(".")
    return fragments
def is_cas(text):
    """Return True if *text* is laid out like a CAS Registry Number.

    The accepted layout is ``<2-7 digits>-<2 digits>-<1 digit>``, for example
    ``"7732-18-5"``. Only the layout is checked; the final check digit is not
    verified.

    Args:
        text: String to test.

    Returns:
        bool: True when the whole string matches the layout.
    """
    # fullmatch (instead of match + ``$``) rejects a trailing newline, and
    # re.ASCII stops ``\d`` from accepting non-ASCII digit characters.
    pattern = r"\d{2,7}-\d{2}-\d"
    return re.fullmatch(pattern, text, flags=re.ASCII) is not None
def largest_mol(smiles):
    """Return the longest valid fragment of a dot-separated SMILES string.

    "Largest" is judged by the length of the fragment's text, not by atom
    count. Fragments that fail ``is_smiles`` are skipped.

    Args:
        smiles: SMILES string, possibly made of several ``.``-separated
            fragments.

    Returns:
        str: The longest fragment accepted by ``is_smiles``. Among fragments
        of equal length, the one appearing last in *smiles* wins.

    Raises:
        ValueError: If no fragment is accepted by ``is_smiles``.
    """
    # sorted() is stable, so walking the result backwards visits the longest
    # fragments first and, within one length, the later fragment first.
    fragments = sorted(smiles.split("."), key=len)
    for fragment in reversed(fragments):
        if is_smiles(fragment):
            return fragment
    # Previously this case fell through to an IndexError on an empty list.
    raise ValueError("No valid SMILES fragment found")
def canonical_smiles(smiles):
    """Convert a SMILES string to RDKit's canonical form.

    Args:
        smiles: SMILES string to canonicalize.

    Returns:
        str: The canonical SMILES, or the literal text
        ``"Invalid SMILES string"`` if parsing or writing raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        canonical = Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        return "Invalid SMILES string"
    return canonical
def tanimoto(s1, s2):
    """Compute the Tanimoto similarity between two molecules given as SMILES.

    Both molecules are parsed, then each is converted to a 2048-bit Morgan
    fingerprint of radius 2 before the two fingerprints are compared.

    Args:
        s1: SMILES string of the first molecule.
        s2: SMILES string of the second molecule.

    Returns:
        The similarity value from ``DataStructs.TanimotoSimilarity``, or the
        text ``"Error: Not a valid SMILES string"`` when a TypeError,
        ValueError or AttributeError is raised along the way.
    """
    try:
        # Parse both inputs first, then fingerprint them in the same order.
        molecules = [Chem.MolFromSmiles(smi) for smi in (s1, s2)]
        fp_a, fp_b = [
            AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
            for mol in molecules
        ]
        return DataStructs.TanimotoSimilarity(fp_a, fp_b)
    except (TypeError, ValueError, AttributeError):
        return "Error: Not a valid SMILES string"
def CIRconvert(ids):
    """Look up the SMILES for an identifier via the NCI CACTUS resolver.

    Args:
        ids: Identifier string to resolve. It is percent-encoded with
            ``quote`` before being placed in the request URL.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        Any error raised by ``urlopen`` (for example on an HTTP error status
        or an unreachable host) propagates to the caller.
    """
    url = "http://cactus.nci.nih.gov/chemical/structure/" + quote(ids) + "/smiles"
    # The context manager closes the HTTP response even if read/decode fails.
    with urlopen(url) as response:
        return response.read().decode("utf8")
def pubchem_query2smiles(
    query: str,
    url: str = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}",
) -> str:
    """Resolve a molecule name (or SMILES) to a SMILES string.

    A query that already parses as a single SMILES is returned unchanged.
    Otherwise PubChem is queried by name; if its answer does not contain an
    ``IsomericSMILES`` property, ``CIRconvert`` is tried as a fallback.

    Args:
        query: Molecule name or SMILES string.
        url: URL template with two ``{}`` slots: the first receives *query*,
            the second the ``property/IsomericSMILES/JSON`` operation. None
            selects the default PubChem template.

    Returns:
        str: *query* itself when it is a single SMILES; otherwise the
        canonical SMILES of the largest fragment found, or an explanatory
        message when neither service yields a result.

    Raises:
        ValueError: If *query* is a SMILES string with several fragments.
    """
    # Local import: the fallback below needs the urllib error type raised by
    # CIRconvert's urlopen call.
    from urllib.error import URLError

    if is_smiles(query):
        if is_multiple_smiles(query):
            raise ValueError(
                "Multiple SMILES strings detected, input one molecule at a time."
            )
        return query
    if url is None:
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}"
    r = requests.get(url.format(query, "property/IsomericSMILES/JSON"))
    # convert the response to a json object
    data = r.json()
    try:
        smi = data["PropertyTable"]["Properties"][0]["IsomericSMILES"]
    except (KeyError, IndexError, TypeError):
        # PubChem's answer lacks the property (e.g. a fault document);
        # fall back to the CACTUS resolver.
        try:
            smi = CIRconvert(query)
        except (URLError, KeyError):
            # urlopen reports failed lookups as HTTPError/URLError, so those
            # must be caught for this message to be reachable.
            return "Could not find a molecule matching the text. One possible cause is that the input is incorrect, input one molecule at a time."
    return str(Chem.CanonSmiles(largest_mol(smi)))
def _sections_with_heading(sections, heading):
    """Yield each entry of *sections* whose "TOCHeading" equals *heading*."""
    for section in sections:
        if section.get("TOCHeading") == heading:
            yield section


def query2cas(query: str, url_cid: str, url_data: str):
    """Look up the CAS number of a molecule through PubChem.

    The query is first resolved to a PubChem CID, then the record for that
    CID is searched for the "CAS" entry under
    "Names and Identifiers" -> "Other Identifiers".

    Args:
        query: Molecule name or single-fragment SMILES string.
        url_cid: URL template with two ``{}`` slots: lookup mode (``"name"``
            or ``"smiles"``) and the percent-encoded query.
        url_data: URL template with one ``{}`` slot for the CID.

    Returns:
        The first CAS string found in the record.

    Raises:
        ValueError: If *query* is a multi-fragment SMILES, if a request fails
            or a response lacks the expected fields, or if the record has no
            CAS entry.
    """
    try:
        mode = "name"
        if is_smiles(query):
            if is_multiple_smiles(query):
                raise ValueError(
                    "Multiple SMILES strings detected, input one molecule at a time."
                )
            mode = "smiles"
        # Percent-encode the query: characters such as "#" (triple bond) or
        # "/" would otherwise be read as URL syntax and truncate the path.
        url_cid = url_cid.format(mode, quote(query, safe=""))
        cid = requests.get(url_cid).json()["IdentifierList"]["CID"][0]
        url_data = url_data.format(cid)
        data = requests.get(url_data).json()
    except (requests.exceptions.RequestException, KeyError, IndexError) as err:
        raise ValueError("Invalid molecule input, no Pubchem entry") from err
    try:
        for names in _sections_with_heading(
            data["Record"]["Section"], "Names and Identifiers"
        ):
            for others in _sections_with_heading(
                names["Section"], "Other Identifiers"
            ):
                for cas in _sections_with_heading(others["Section"], "CAS"):
                    return cas["Information"][0]["Value"]["StringWithMarkup"][0][
                        "String"
                    ]
    except (KeyError, IndexError) as err:
        raise ValueError("Invalid molecule input, no Pubchem entry") from err
    raise ValueError("CAS number not found")
def smiles2name(smi, single_name=True):
    """Query PubChem for the synonyms of a molecule given as SMILES.

    Args:
        smi: SMILES string; it is canonicalized with RDKit before the query.
        single_name: When True, return only the first synonym that does not
            look like a CAS number. When False, return the whole synonym
            list.

    Returns:
        A single name (``single_name=True``) or the list of synonyms.

    Raises:
        ValueError: "Invalid SMILES string" if canonicalization fails,
            "Unknown Molecule" if the response has no synonym list, or
            "No name found" if every synonym looks like a CAS number.
    """
    try:
        smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True)
    except Exception as err:
        raise ValueError("Invalid SMILES string") from err
    # query the PubChem database; the SMILES is percent-encoded so that
    # characters such as "#" or "/" are not read as URL syntax
    r = requests.get(
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
        + quote(smi, safe="")
        + "/synonyms/JSON"
    )
    # convert the response to a json object
    data = r.json()
    try:
        names = data["InformationList"]["Information"][0]["Synonym"]
    except (KeyError, IndexError) as err:
        raise ValueError("Unknown Molecule") from err
    if not single_name:
        return names
    # first synonym that is not CAS-shaped; None when there is no such entry
    name = next((candidate for candidate in names if not is_cas(candidate)), None)
    if name is None:
        raise ValueError("No name found")
    return name
|