Spaces:

jinysun
/

TeLLAgent

Build error

App Files Files Community

TeLLAgent / utils.py

jinysun

Upload 9 files

dbaa85f verified about 1 year ago

raw

history blame

5.04 kB

	import re

	import requests
	from rdkit import Chem, DataStructs
	from rdkit.Chem import AllChem

	from urllib.request import urlopen
	from urllib.parse import quote

	def is_smiles(text):
	    """Return True if RDKit can parse *text* as a SMILES string.

	    Parsing is done with ``sanitize=False``, so the parsed molecule is not
	    sanitized; only whether the parser yields a molecule is checked.  Any
	    exception raised by the parser is treated as "not a SMILES".
	    """
	    try:
	        mol = Chem.MolFromSmiles(text, sanitize=False)
	    except Exception:
	        # ``Exception`` instead of a bare ``except`` so that
	        # KeyboardInterrupt / SystemExit are not swallowed.
	        return False
	    return mol is not None


	def is_multiple_smiles(text):
	    """Tell whether *text* is a SMILES string containing a "." separator."""
	    # The dot is only looked for once the whole string passes is_smiles().
	    return is_smiles(text) and "." in text


	def split_smiles(text):
	    """Break *text* into the list of pieces found between "." characters."""
	    separator = "."
	    fragments = text.split(separator)
	    return fragments


	def is_cas(text):
	    """Return True if *text* has the layout of a CAS Registry Number.

	    The accepted layout is 2-7 digits, a hyphen, 2 digits, a hyphen and one
	    final digit (for example ``"64-17-5"``).  Only the layout is checked;
	    the trailing check digit is not verified.
	    """
	    # fullmatch() rejects a trailing newline, which match() with ``^...$``
	    # silently accepts; re.ASCII restricts \d to the characters 0-9.
	    return re.fullmatch(r"\d{2,7}-\d{2}-\d", text, flags=re.ASCII) is not None


	def largest_mol(smiles):
	    """Return the longest "."-separated fragment of *smiles* that is valid SMILES.

	    "Largest" is measured by string length.  Among fragments of equal
	    length, the one appearing last in *smiles* is preferred.

	    Raises:
	        ValueError: if none of the fragments passes ``is_smiles``.
	    """
	    fragments = sorted(smiles.split("."), key=len)
	    # Walk from the longest fragment down and stop at the first valid one.
	    for fragment in reversed(fragments):
	        if is_smiles(fragment):
	            return fragment
	    # Previously this case surfaced as an opaque IndexError on an empty list.
	    raise ValueError("No valid SMILES fragment found in input")


	def canonical_smiles(smiles):
	    """Convert *smiles* to the canonical SMILES text produced by RDKit.

	    Instead of raising, the literal text "Invalid SMILES string" is returned
	    when the conversion fails with any ``Exception``.
	    """
	    try:
	        mol = Chem.MolFromSmiles(smiles)
	        return Chem.MolToSmiles(mol, canonical=True)
	    except Exception:
	        return "Invalid SMILES string"


	def tanimoto(s1, s2):
	    """Compute the Tanimoto similarity between two molecules given as SMILES.

	    Both inputs are converted to 2048-bit Morgan fingerprints of radius 2
	    and compared.  If a TypeError, ValueError or AttributeError is raised
	    along the way, the text "Error: Not a valid SMILES string" is returned
	    instead of a number.
	    """
	    try:
	        # Parse both inputs first, then fingerprint them in the same order.
	        mols = [Chem.MolFromSmiles(smi) for smi in (s1, s2)]
	        fingerprints = [
	            AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
	            for mol in mols
	        ]
	        return DataStructs.TanimotoSimilarity(*fingerprints)
	    except (TypeError, ValueError, AttributeError):
	        return "Error: Not a valid SMILES string"

	def CIRconvert(ids, timeout=30):
	    """Resolve *ids* to SMILES via the cactus.nci.nih.gov chemical structure service.

	    Args:
	        ids: Identifier to resolve; it is percent-encoded with ``quote``
	            before being placed in the request URL.
	        timeout: Seconds to wait on the connection before ``urlopen``
	            gives up.

	    Returns:
	        The response body decoded as UTF-8.

	    Raises:
	        Whatever ``urlopen`` raises when the request fails; these errors are
	        not handled here.
	    """
	    url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
	    # The context manager closes the HTTP response even if read()/decode() fails.
	    with urlopen(url, timeout=timeout) as response:
	        return response.read().decode('utf8')



	def pubchem_query2smiles(
	    query: str,
	    url: str = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}",
	) -> str:
	    """Resolve a molecule name to a canonical SMILES string.

	    If *query* already passes ``is_smiles`` it is returned unchanged (a
	    multi-component SMILES is rejected).  Otherwise PubChem is queried by
	    name and, if that yields no SMILES, ``CIRconvert`` is tried as a fallback.

	    Args:
	        query: Molecule name, or a single SMILES string.
	        url: URL template with two ``{}`` slots (name, then the requested
	            output); ``None`` selects the default PubChem template.

	    Returns:
	        The canonical SMILES of the largest fragment found, the unchanged
	        *query* when it is already a SMILES, or an explanatory message
	        string when neither service knows the molecule.

	    Raises:
	        ValueError: if *query* is a SMILES containing several molecules.
	    """
	    if is_smiles(query):
	        if is_multiple_smiles(query):
	            raise ValueError(
	                "Multiple SMILES strings detected, input one molecule at a time."
	            )
	        return query
	    if url is None:
	        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}"
	    # Percent-encode the name so characters such as "#", "?" or "/" cannot
	    # change the structure of the request URL.
	    r = requests.get(
	        url.format(quote(query, safe=""), "property/IsomericSMILES/JSON"),
	        timeout=30,
	    )
	    try:
	        # ValueError covers a response body that is not valid JSON.
	        props = r.json()["PropertyTable"]["Properties"][0]
	        # NOTE(review): some PubChem responses reportedly label this value
	        # "SMILES" instead of "IsomericSMILES" -- confirm against the
	        # current PUG REST documentation.
	        smi = props["IsomericSMILES"] if "IsomericSMILES" in props else props["SMILES"]
	    except (KeyError, IndexError, TypeError, ValueError):
	        try:
	            smi = CIRconvert(query)
	        except (OSError, ValueError, KeyError):
	            # urlopen failures are OSError subclasses; the previous
	            # ``except KeyError`` never matched them, so this message
	            # could not be returned.
	            return "Could not find a molecule matching the text. One possible cause is that the input is incorrect, input one molecule at a time."
	    return str(Chem.CanonSmiles(largest_mol(smi)))


	def _sections_with_heading(sections, heading):
	    """Return an iterator over the dicts in *sections* whose "TOCHeading" equals *heading*."""
	    return (section for section in sections if section.get("TOCHeading") == heading)


	def query2cas(query: str, url_cid: str, url_data: str):
	    """Look up the CAS number of a molecule given by name or SMILES.

	    Args:
	        query: Molecule name or a single SMILES string.
	        url_cid: URL template with two ``{}`` slots, filled with the lookup
	            mode (``"name"`` or ``"smiles"``) and the percent-encoded query;
	            its JSON response must contain ``IdentifierList -> CID``.
	        url_data: URL template with one ``{}`` slot for the CID; its JSON
	            response is searched for the "CAS" section.

	    Returns:
	        The first CAS string found under
	        "Names and Identifiers" -> "Other Identifiers" -> "CAS".

	    Raises:
	        ValueError: if *query* holds several SMILES, if a request fails or
	            the response lacks the expected structure, or if the record has
	            no CAS section.
	    """
	    mode = "name"
	    if is_smiles(query):
	        if is_multiple_smiles(query):
	            raise ValueError(
	                "Multiple SMILES strings detected, input one molecule at a time."
	            )
	        mode = "smiles"

	    try:
	        # Percent-encode the query: an unescaped "#" (SMILES triple bond)
	        # would otherwise start a URL fragment and cut the query short.
	        url_cid = url_cid.format(mode, quote(query, safe=""))
	        cid = requests.get(url_cid, timeout=30).json()["IdentifierList"]["CID"][0]
	        url_data = url_data.format(cid)
	        data = requests.get(url_data, timeout=30).json()
	    except (
	        requests.exceptions.RequestException,
	        KeyError,
	        IndexError,
	        TypeError,
	    ) as err:
	        raise ValueError("Invalid molecule input, no Pubchem entry") from err

	    try:
	        for names in _sections_with_heading(
	            data["Record"]["Section"], "Names and Identifiers"
	        ):
	            for other in _sections_with_heading(names["Section"], "Other Identifiers"):
	                for cas in _sections_with_heading(other["Section"], "CAS"):
	                    return cas["Information"][0]["Value"]["StringWithMarkup"][0][
	                        "String"
	                    ]
	    except (KeyError, IndexError, TypeError) as err:
	        raise ValueError("Invalid molecule input, no Pubchem entry") from err

	    raise ValueError("CAS number not found")


	def smiles2name(smi, single_name=True):
	    """Query PubChem for the synonyms of the molecule given as SMILES.

	    Args:
	        smi: SMILES string; it is canonicalised with RDKit before the query.
	        single_name: When true, return only the first synonym that does not
	            look like a CAS number; otherwise return the whole synonym list.

	    Returns:
	        A single name (str) or the list of synonyms, depending on
	        *single_name*.

	    Raises:
	        ValueError: "Invalid SMILES string" if RDKit cannot canonicalise
	            *smi*, "Unknown Molecule" if the response has no synonym list,
	            "No name found" if every synonym looks like a CAS number.
	    """
	    try:
	        smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True)
	    except Exception as err:
	        raise ValueError("Invalid SMILES string") from err
	    # Query the PubChem database.  The SMILES is percent-encoded because
	    # characters such as "#" or "/" would otherwise alter the URL itself.
	    r = requests.get(
	        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
	        + quote(smi, safe="")
	        + "/synonyms/JSON",
	        timeout=30,
	    )
	    # Convert the response to a JSON object.
	    data = r.json()
	    try:
	        names = data["InformationList"]["Information"][0]["Synonym"]
	    except (KeyError, IndexError, TypeError) as err:
	        raise ValueError("Unknown Molecule") from err
	    if not single_name:
	        return names
	    # Skip CAS-number-like synonyms; an empty list also ends up below.
	    for candidate in names:
	        if not is_cas(candidate):
	            return candidate
	    raise ValueError("No name found")