# DeepMS2 / core / pycdk.py
# Repository page metadata: uploaded by kairongLi ("Upload 44 files"), commit fb5f46a.
import os
import jpype
import numpy as np
from jpype import java, isJVMStarted, startJVM, getDefaultJVMPath, JPackage
################################## Start JVM #########################################
# Path of the CDK jar handed to the JVM class path.
# NOTE(review): single-argument os.path.join leaves the name unchanged, so the
# jar is looked up relative to the current working directory -- confirm that
# this is intended.
cdk_path = os.path.join('cdk-2.2.jar')
jpype.addClassPath(cdk_path)
# Only start the JVM if this process has not started one yet; calling
# startJVM a second time (e.g. another module already started it) fails.
if not isJVMStarted():
    startJVM(getDefaultJVMPath(), "-ea", "-Djava.class.path=%s" % cdk_path)
# Root of the CDK Java package tree used by every wrapper below.
cdk = JPackage('org').openscience.cdk
############################### Format Conversion #####################################
def MolFromSmiles(smi):
    """Parse a SMILES string with CDK's ``SmilesParser``.

    Args:
        smi: SMILES text passed to ``SmilesParser.parseSmiles``.

    Returns:
        The object returned by ``parseSmiles``.

    Raises:
        IOError: if ``parseSmiles`` raises; the original exception is kept
            as ``__cause__``.
    """
    parser = cdk.smiles.SmilesParser(cdk.DefaultChemObjectBuilder.getInstance())
    try:
        return parser.parseSmiles(smi)
    except Exception as err:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit are not converted into "invalid smiles input".
        raise IOError('invalid smiles input') from err
def MolFromInchi(inchi):
    """Convert an InChI string to a CDK atom container.

    Uses ``InChIGeneratorFactory.getInChIToStructure`` with the default
    chem-object builder and returns its ``getAtomContainer()`` result.
    """
    builder = cdk.DefaultChemObjectBuilder.getInstance()
    factory = cdk.inchi.InChIGeneratorFactory.getInstance()
    return factory.getInChIToStructure(inchi, builder).getAtomContainer()
def MolFromFile(sdf):
    """Read all atom containers from a structure file.

    The file is opened with ``java.io.FileReader``; the concrete CDK reader
    is chosen by ``ReaderFactory.createReader``.

    Args:
        sdf: path of the file to read.

    Returns:
        The result of ``ChemFileManipulator.getAllAtomContainers`` for the
        parsed chem file.
    """
    file_reader = java.io.FileReader(java.io.File(sdf))
    try:
        reader = cdk.io.ReaderFactory().createReader(file_reader)
        builder = cdk.DefaultChemObjectBuilder.getInstance()
        content = reader.read(builder.newInstance(cdk.interfaces.IChemFile))
        return cdk.tools.manipulator.ChemFileManipulator.getAllAtomContainers(content)
    finally:
        # Always release the OS file handle, including when parsing fails.
        file_reader.close()
def MolToSmiles(mol):
    """Return the SMILES created for *mol* by a ``SmilesGenerator``
    configured with ``SmiFlavor.Isomeric``."""
    generator = cdk.smiles.SmilesGenerator(cdk.smiles.SmiFlavor.Isomeric)
    return generator.create(mol)
def MolToInchi(mol):
    """Return ``getInchi()`` of the InChI generator CDK builds for *mol*."""
    factory = cdk.inchi.InChIGeneratorFactory.getInstance()
    return factory.getInChIGenerator(mol).getInchi()
def MolToInchiKey(mol):
    """Return ``getInchiKey()`` of the InChI generator CDK builds for *mol*."""
    factory = cdk.inchi.InChIGeneratorFactory.getInstance()
    return factory.getInChIGenerator(mol).getInchiKey()
def MolToMOPAC(mol):
    """Serialise *mol* with CDK's ``Mopac7Writer`` and return the text.

    The writer targets an in-memory ``java.io.StringWriter``; its content
    is returned after the writer has been closed.
    """
    buffer = java.io.StringWriter()
    mopac_writer = cdk.io.program.Mopac7Writer(buffer)
    mopac_writer.write(mol)
    mopac_writer.close()
    return buffer.toString()
def MolToSDF(mol):
    """Serialise *mol* with CDK's ``SDFWriter`` and return the text.

    The writer targets an in-memory ``java.io.StringWriter``; its content
    is returned after the writer has been closed.
    """
    buffer = java.io.StringWriter()
    sdf_writer = cdk.io.SDFWriter(buffer)
    sdf_writer.write(mol)
    sdf_writer.close()
    return buffer.toString()
############################# Molecular Properties ##################################
def MolToFormula(mol, string=True):
    """Derive the molecular formula of *mol*.

    Args:
        mol: molecule passed to
            ``MolecularFormulaManipulator.getMolecularFormula``.
        string: when truthy, return ``getString`` of the formula; otherwise
            return the formula object itself.
    """
    manipulator = cdk.tools.manipulator.MolecularFormulaManipulator
    formula = manipulator.getMolecularFormula(mol)
    return manipulator.getString(formula) if string else formula
def getMolExactMass(mol):
    """Return ``getMajorIsotopeMass`` of the molecular formula of *mol*."""
    manipulator = cdk.tools.manipulator.MolecularFormulaManipulator
    return manipulator.getMajorIsotopeMass(manipulator.getMolecularFormula(mol))
def getMolNaturalMass(mol):
    """Return ``AtomContainerManipulator.getNaturalExactMass`` for *mol*."""
    return cdk.tools.manipulator.AtomContainerManipulator.getNaturalExactMass(mol)
def getMolTotalFormalCharge(mol):
    """Return ``AtomContainerManipulator.getTotalFormalCharge`` for *mol*."""
    return cdk.tools.manipulator.AtomContainerManipulator.getTotalFormalCharge(mol)
def getMolTotalNegativeFormalCharge(mol):
    """Return ``AtomContainerManipulator.getTotalNegativeFormalCharge`` for *mol*."""
    return cdk.tools.manipulator.AtomContainerManipulator.getTotalNegativeFormalCharge(mol)
def getMolTotalPositiveFormalCharge(mol):
    """Return ``AtomContainerManipulator.getTotalPositiveFormalCharge`` for *mol*."""
    return cdk.tools.manipulator.AtomContainerManipulator.getTotalPositiveFormalCharge(mol)
############################# Formula and Isotope ##################################
def FormulaFromString(string):
    """Build a CDK molecular formula object from formula text.

    The builder is taken from a fresh ``cdk.formula.MolecularFormula`` and
    handed to ``MolecularFormulaManipulator.getMolecularFormula``.
    """
    manipulator = cdk.tools.manipulator.MolecularFormulaManipulator
    builder = cdk.formula.MolecularFormula().getBuilder()
    return manipulator.getMolecularFormula(string, builder)
def FormulaToString(formula):
    """Return ``MolecularFormulaManipulator.getString`` for *formula*."""
    return cdk.tools.manipulator.MolecularFormulaManipulator.getString(formula)
def add_formula(string1, string2):
    """Add two formula strings and return the sum as a formula string.

    Both inputs are parsed with ``FormulaFromString``; the result of calling
    ``add`` on the first with the second is rendered by ``FormulaToString``.
    """
    total = FormulaFromString(string1).add(FormulaFromString(string2))
    return FormulaToString(total)
def subtract_formula(string1, string2):
    """Subtract formula ``string2`` from formula ``string1``.

    Both formulas are broken into per-element counts with
    ``parser_formula``. If ``string2`` contains an element that ``string1``
    lacks, or more of an element than ``string1`` has, a message is printed
    and ``string1`` is returned unchanged.

    Args:
        string1: formula text to subtract from.
        string2: formula text to subtract.

    Returns:
        The difference as a formula string normalised through
        ``FormulaFromString`` / ``FormulaToString``; ``string1`` when the
        subtraction is not possible; ``''`` when nothing remains.
    """
    remaining = parser_formula(string1)
    for symbol, count in parser_formula(string2).items():
        if symbol not in remaining or remaining[symbol] < count:
            # Message corrected: this branch means formula2 is NOT contained
            # in formula1 (the old text was misspelled and said the opposite).
            print('formula2 is not part of formula1')
            return string1
        remaining[symbol] -= count
    # Leave out elements that were fully removed so no "X0" terms are
    # written into the text handed back to the formula parser.
    parts = ['%s%d' % (symbol, count) for symbol, count in remaining.items() if count > 0]
    if not parts:
        return ''
    return FormulaToString(FormulaFromString(''.join(parts)))
def parser_formula(string):
    """Split a formula string into a ``{symbol: count}`` dict.

    The string is parsed with ``FormulaFromString``; every isotope of the
    resulting formula contributes ``getSymbol()`` as key and
    ``formula.getIsotopeCount(isotope)`` as value. Isotopes sharing a symbol
    overwrite each other, the last one wins.
    """
    formula = FormulaFromString(string)
    counts = {}
    cursor = formula.isotopes().iterator()
    while cursor.hasNext():
        isotope = cursor.next()
        counts[isotope.getSymbol()] = formula.getIsotopeCount(isotope)
    return counts
def getFormulaExactMass(string):
    """Return ``getMajorIsotopeMass`` for the formula parsed from *string*."""
    parsed = FormulaFromString(string)
    return cdk.tools.manipulator.MolecularFormulaManipulator.getMajorIsotopeMass(parsed)
def getFormulaNaturalMass(string):
    """Return ``getNaturalExactMass`` for the formula parsed from *string*."""
    parsed = FormulaFromString(string)
    return cdk.tools.manipulator.MolecularFormulaManipulator.getNaturalExactMass(parsed)
def getFormulaDBE(string):
    """Return ``MolecularFormulaManipulator.getDBE`` for the formula parsed
    from *string*."""
    parsed = FormulaFromString(string)
    return cdk.tools.manipulator.MolecularFormulaManipulator.getDBE(parsed)
def IsotopeFromString(string, minI=0.01):
    """Parse *string* as a formula and delegate to ``IsotopeFromFormula``.

    Args:
        string: formula text.
        minI: forwarded unchanged to ``IsotopeFromFormula``.
    """
    return IsotopeFromFormula(FormulaFromString(string), minI)
def IsotopeFromFormula(formula, minI=0.01):
    """Generate the isotope pattern of a CDK formula as a numpy array.

    Args:
        formula: formula object given to ``IsotopePatternGenerator.getIsotopes``.
        minI: constructor argument of ``IsotopePatternGenerator``.

    Returns:
        ``np.array`` built from one ``(getMass(), getIntensity())`` pair per
        isotope container of the generated pattern.
    """
    pattern = cdk.formula.IsotopePatternGenerator(minI).getIsotopes(formula)
    peaks = []
    for peak in pattern.getIsotopes():
        peaks.append((peak.getMass(), peak.getIntensity()))
    return np.array(peaks)
def IsotopeFromArray(array):
    """Build a CDK ``IsotopePattern`` from ``(mass, intensity)`` pairs.

    Each pair becomes an ``IsotopeContainer`` added to the pattern; the
    pattern is then passed through ``IsotopePatternManipulator.normalize``
    and ``sortByMass``, and that result is returned.
    """
    pattern = cdk.formula.IsotopePattern()
    tools = cdk.formula.IsotopePatternManipulator
    make_peak = cdk.formula.IsotopeContainer
    for mass, intensity in array:
        pattern.addIsotope(make_peak(mass, intensity))
    return tools.sortByMass(tools.normalize(pattern))
def IsotopeToArray(isotopes):
    """Convert an isotope pattern object to a numpy array.

    Returns ``np.array`` of ``(getMass(), getIntensity())`` pairs, one per
    element of ``isotopes.getIsotopes()``.
    """
    peaks = []
    for peak in isotopes.getIsotopes():
        peaks.append((peak.getMass(), peak.getIntensity()))
    return np.array(peaks)
def IsotopeSimilarity(isotope_array_1, isotope_array_2, tolerance_ppm=10):
    """Compare two isotope patterns given as ``(mass, intensity)`` arrays.

    Both arrays are converted with ``IsotopeFromArray`` and compared by a
    ``cdk.formula.IsotopePatternSimilarity`` whose tolerance is set to
    *tolerance_ppm*. Returns the value of its ``compare`` call.
    """
    comparer = cdk.formula.IsotopePatternSimilarity()
    # NOTE(review): `seTolerance` (sic) is the method name called here; it
    # appears to be the library's own spelling -- verify before "fixing" it.
    comparer.seTolerance(tolerance_ppm)
    first = IsotopeFromArray(isotope_array_1)
    second = IsotopeFromArray(isotope_array_2)
    return comparer.compare(first, second)
def generate_formula(mass, window=0.01, atom_list = {'C': [0, 20], 'H': [0, 20], 'O': [0, 20], 'N': [0, 20], 'P': [0, 20], 'S': [0, 20]}, astring=True):
    """Enumerate formulas whose mass lies in ``[mass - window, mass + window]``.

    Args:
        mass: centre of the mass interval.
        window: half-width of the mass interval.
        atom_list: mapping ``symbol -> (minimum, maximum)`` count; each
            symbol's major isotope is added to a ``MolecularFormulaRange``.
            The default dict is only read, never modified.
        astring: when truthy, return a list of formula strings; otherwise
            return the result of ``molecularFormulas()`` as is.
    """
    isotope_factory = cdk.config.Isotopes.getInstance()
    element_ranges = cdk.formula.MolecularFormulaRange()
    for symbol, bounds in atom_list.items():
        lower, upper = bounds
        element_ranges.addIsotope(isotope_factory.getMajorIsotope(symbol), lower, upper)
    builder = cdk.formula.MolecularFormula().getBuilder()
    search = cdk.formula.MolecularFormulaGenerator(builder, mass - window, mass + window, element_ranges)
    candidates = search.getAllFormulas().molecularFormulas()
    if not astring:
        return candidates
    return [FormulaToString(candidate) for candidate in candidates]
def check_formula(formula, NitrogenRuleCheck=True, RDBERuleCheck=True):
    """Validate a formula against the enabled CDK formula rules.

    Args:
        formula: a formula string (any ``str`` subclass) or an already
            parsed formula object. Strings are parsed with
            ``FormulaFromString``.
        NitrogenRuleCheck: apply ``cdk.formula.rules.NitrogenRule``.
        RDBERuleCheck: apply ``cdk.formula.rules.RDBERule``.

    Returns:
        ``True`` when the product of the enabled rules' ``validate`` results
        is greater than zero (also when no rule is enabled), else ``False``.
    """
    # isinstance, not type() ==, so str subclasses (e.g. numpy.str_) are
    # parsed too instead of being passed to the Java validators as text.
    if isinstance(formula, str):
        formula = FormulaFromString(formula)
    valid = 1
    if NitrogenRuleCheck:
        valid *= cdk.formula.rules.NitrogenRule().validate(formula)
    if RDBERuleCheck:
        valid *= cdk.formula.rules.RDBERule().validate(formula)
    return bool(valid > 0)
def generate_valid_formula(mass, window, atom_list, maxDBE, NitrogenRuleCheck=True):
    """Generate candidate formulas and keep those passing the DBE filter
    and, optionally, the nitrogen rule.

    Candidates come from ``generate_formula(mass, window, atom_list)``. A
    candidate is dropped when its ``getFormulaDBE`` value is below 0 or
    above *maxDBE*, or when *NitrogenRuleCheck* is truthy and
    ``check_formula`` (nitrogen rule only) rejects it.
    """
    def _accept(candidate):
        dbe = getFormulaDBE(candidate)
        if (dbe < 0) or (dbe > maxDBE):
            return False
        if NitrogenRuleCheck and not check_formula(candidate, NitrogenRuleCheck=True, RDBERuleCheck=False):
            return False
        return True

    return [candidate for candidate in generate_formula(mass, window, atom_list) if _accept(candidate)]
############################### Fingerprint ########################################
def getFingerprint(mol, fp_type="standard", size=1024, depth=6, transform=True):
    """Compute a CDK fingerprint for *mol*.

    Args:
        mol: molecule passed to the fingerprinter's ``getBitFingerprint``.
        fp_type: one of "standard", "extended", "substructure", "graph",
            "maccs", "pubchem", "estate", "hybridization", "lingo",
            "klekota-roth", "shortestpath", "signature", "circular".
        size: size argument for the fingerprinters that take one.
        depth: depth argument for the fingerprinters that take one.
        transform: when truthy, return a dict ``{'nbit': ..., 'bits': [...]}``
            listing the indices of the set bits; otherwise return the object
            produced by ``getBitFingerprint`` unchanged.

    Returns:
        See *transform*. ``nbit`` is 166 / 79 / 881 / 4860 for "maccs" /
        "estate" / "pubchem" / "klekota-roth" and *size* for every other
        type, including types whose fingerprinter is built without *size*.

    Raises:
        IOError: if *fp_type* is not a known fingerprint type.
    """
    fixed_nbit = {'maccs': 166, 'estate': 79, 'pubchem': 881, 'klekota-roth': 4860}
    # Factories instead of instances: only the requested fingerprinter is
    # constructed, rather than all thirteen on every call.
    factories = {
        "standard": lambda: cdk.fingerprint.Fingerprinter(size, depth),
        "extended": lambda: cdk.fingerprint.ExtendedFingerprinter(size, depth),
        "substructure": lambda: cdk.fingerprint.SubstructureFingerprinter(),
        "graph": lambda: cdk.fingerprint.GraphOnlyFingerprinter(size, depth),
        "maccs": lambda: cdk.fingerprint.MACCSFingerprinter(),
        "pubchem": lambda: cdk.fingerprint.PubchemFingerprinter(cdk.silent.SilentChemObjectBuilder.getInstance()),
        "estate": lambda: cdk.fingerprint.EStateFingerprinter(),
        "hybridization": lambda: cdk.fingerprint.HybridizationFingerprinter(size, depth),
        "lingo": lambda: cdk.fingerprint.LingoFingerprinter(depth),
        "klekota-roth": lambda: cdk.fingerprint.KlekotaRothFingerprinter(),
        "shortestpath": lambda: cdk.fingerprint.ShortestPathFingerprinter(size),
        "signature": lambda: cdk.fingerprint.SignatureFingerprinter(depth),
        "circular": lambda: cdk.fingerprint.CircularFingerprinter(),
    }
    if fp_type not in factories:
        raise IOError('invalid fingerprint type')
    fp = factories[fp_type]().getBitFingerprint(mol)
    if not transform:
        return fp
    bitset = fp.asBitSet()
    bits = []
    # Walk the set bits; nextSetBit returns a negative index when done.
    idx = bitset.nextSetBit(0)
    while idx >= 0:
        bits.append(idx)
        idx = bitset.nextSetBit(idx + 1)
    return {'nbit': fixed_nbit.get(fp_type, size), 'bits': bits}
def TanimotoSimilarity(fingerprint_1, fingerprint_2):
    """Return ``cdk.similarity.Tanimoto.calculate`` for the two fingerprints."""
    return cdk.similarity.Tanimoto.calculate(fingerprint_1, fingerprint_2)
################################# Fragmenter #########################################
def generateFragments(mol, method='MurckoFragmenter', minFragSize=6, singleFrameworkOnly=True, asSmiles=True):
    """Fragment *mol* with one of CDK's fragmenters.

    Args:
        mol: molecule passed to the fragmenter's ``generateFragments``.
        method: 'MurckoFragmenter' or 'ExhaustiveFragmenter'.
        minFragSize: passed to the fragmenter constructor.
        singleFrameworkOnly: passed to ``MurckoFragmenter`` only.
        asSmiles: when truthy, collect ``getFragments()``; otherwise
            ``getFragmentsAsContainers()``.

    Returns:
        ``np.array`` of the collected fragments.

    Raises:
        IOError: if *method* is neither of the two supported names.
    """
    if method not in ('MurckoFragmenter', 'ExhaustiveFragmenter'):
        raise IOError('Invalid fragmentation method')
    if method == 'MurckoFragmenter':
        fragmenter = cdk.fragment.MurckoFragmenter(singleFrameworkOnly, minFragSize)
    else:
        fragmenter = cdk.fragment.ExhaustiveFragmenter(minFragSize)
    fragmenter.generateFragments(mol)
    collected = fragmenter.getFragments() if asSmiles else fragmenter.getFragmentsAsContainers()
    return np.array(collected)
################################# Descriptor #########################################
def getMolecularDescriptorCategories():
    """Return, as a list, the dictionary classes reported by a
    ``DescriptorEngine`` for ``IMolecularDescriptor``."""
    builder = cdk.silent.SilentChemObjectBuilder.getInstance()
    engine = cdk.qsar.DescriptorEngine(cdk.qsar.IMolecularDescriptor, builder)
    return list(engine.getAvailableDictionaryClasses())
def getMolecularDescriptor(mol, species='all'):
    """Calculate CDK molecular descriptors for *mol*.

    Args:
        mol: molecule passed to each descriptor's ``calculate``.
        species: ``'all'`` (no filtering), a single category name, or an
            iterable of category names. A descriptor is kept when at least
            one of its dictionary classes is among the requested names.

    Returns:
        Dict mapping the first descriptor name of each descriptor to its
        values: a list of floats on success, or a numpy array of NaN when
        the calculation or the parsing of its result fails.
    """
    engine = cdk.qsar.DescriptorEngine(cdk.qsar.IMolecularDescriptor, cdk.silent.SilentChemObjectBuilder.getInstance())
    descriptors = list(engine.getDescriptorInstances())
    specifications = list(engine.getDescriptorSpecifications())

    # Normalise the requested categories. A bare string must be wrapped,
    # otherwise set operations would treat it as a collection of characters.
    if isinstance(species, str):
        wanted = None if species == 'all' else {species}
    else:
        wanted = {str(s) for s in species}

    # Number of values each descriptor yields, keyed by its first name; used
    # to size the NaN placeholder when a calculation fails.
    keys = ['Fsp3', 'nSmallRings', 'tpsaEfficiency', 'Zagreb', 'XLogP', 'WPATH', 'Wlambda1.unity', 'WTPT-1', 'MW', 'VAdjMat', 'VABC', 'TopoPSA', 'LipinskiFailures', 'nRotB', 'topoShape', 'PetitjeanNumber', 'MOMI-X', 'MDEC-11', 'MLogP', 'nAtomLAC', 'LOBMAX', 'nAtomP', 'nAtomLC', 'khs.sLi', 'Kier1', 'HybRatio', 'nHBDon', 'nHBAcc', 'GRAV-1', 'fragC', 'FMF', 'ECCEN', 'PPSA-1', 'SP-0', 'SPC-4', 'SC-3', 'SCH-3', 'C1SP1', 'bpol', 'nB', 'BCUTw-1l', 'nBase', 'ATSp1', 'ATSm1', 'ATSc1', 'nAtom', 'nAromBond', 'naAromAtom', 'apol', 'ALogP', 'nAcid']
    lens = [1,11,1,1,1,2,17,5,1,1,1,1,1,1,2,1,7,19,1,1,2,1,1,79,3,1,1,1,9,1,1,1,29,16,6,8,10,9,1,1,6,1,5,5,5,1,1,1,1,3,1]
    expected_len = dict(zip(keys, lens))

    Descriptors = {}
    for descriptor, specification in zip(descriptors, specifications):
        if wanted is not None:
            try:
                categories = {str(c) for c in engine.getDictionaryClass(specification)}
            except Exception:
                # Same fallback category as before when the lookup fails.
                categories = {'constitutionalDescriptor'}
            # The previous test compared a set with 0, which is never true,
            # so nothing was ever filtered out.
            if categories.isdisjoint(wanted):
                continue
        names = [str(n) for n in descriptor.getDescriptorNames()]
        name = names[0]
        try:
            raw = str(descriptor.calculate(mol).getValue().toString())
            value = [float(v) for v in raw.split(',')]
        except Exception:
            # Names missing from the table fall back to the number of names
            # the descriptor reports instead of raising ValueError here.
            value = np.repeat(np.nan, expected_len.get(name, len(names)))
        Descriptors[name] = value
    return Descriptors