Monomerizer / src /analyse.py
Playingyoyo's picture
Initial Monomerizer Space
084b58f
#!/usr/bin/env python3
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, DataStructs, Descriptors
import os, sys, requests, tqdm, re, argparse
from collections import defaultdict
import xml.etree.ElementTree as ET
def add_canonical_smiles(df):
canonical_smiles_list = [
"C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N", # Trytophan (W)
"C(C[C@@H](C(=O)O)N)CNC(=N)N", # Arginine (R)
"C1=C(NC=N1)C[C@@H](C(=O)O)N", # Histidine (H)
"C1C[C@H](NC1)C(=O)O", # Proline (P)
"C(CCN)C[C@@H](C(=O)O)N", # Lysine (K)
"CSCC[C@@H](C(=O)O)N", # Methionine (M)
"C(CC(=O)N)[C@@H](C(=O)O)N", # Asparagine (N)
"C([C@@H](C(=O)O)N)C(=O)N", # Glutamine (Q)
"C(CC(=O)O)[C@@H](C(=O)O)N", # Glutamic acid (E)
"OC(=O)C[C@@H](C(=O)O)N", # Aspartic acid (D)
"C1=CC(=CC=C1C[C@@H](C(=O)O)N)O", # Tyrosine (Y)
"C1=CC=C(C=C1)C[C@@H](C(=O)O)N", # Phenylalanine (F)
"CC[C@H](C)[C@@H](C(=O)O)N", # Valine (V)
"CC(C)C[C@@H](C(=O)O)N", # Leucine (L)
"CC(C)[C@@H](C(=O)O)N", # Isoleucine (I)
"C[C@H]([C@@H](C(=O)O)N)O", # Threonine (T)
"C([C@@H](C(=O)O)N)S", # Cysteine (C)
"C([C@@H](C(=O)O)N)O", # Serine (S)
"C[C@@H](C(=O)O)N", # Alanine (A)
"C(C(=O)O)N" # Glycine (G)
]
one_letter_codes = ['W','R','H','P','K','M','N','Q','E','D','Y','F','V','L','I','T','C','S','A','G']
canonical_df = pd.DataFrame({
'ID': one_letter_codes,
'SMILES': canonical_smiles_list,
'CANONICAL': ['True'] * len(canonical_smiles_list),
'TERMINAL': ['NotTer'] * len(canonical_smiles_list),
'ROMol': [Chem.MolFromSmiles(smi) for smi in canonical_smiles_list]
})
return pd.concat([df, canonical_df], ignore_index=True)
def cal_tanimoto(mol):
l_glycine = Chem.MolFromSmiles("C(C(=O)O)N")
fp1 = rdMolDescriptors.GetMorganFingerprint(mol, 2)
fp2 = rdMolDescriptors.GetMorganFingerprint(l_glycine, 2)
return DataStructs.TanimotoSimilarity(fp1, fp2)
def fetch_pubchem_name(smiles):
try:
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles}/property/Title/JSON"
response = requests.get(url)
response.raise_for_status()
data = response.json()
return data['PropertyTable']['Properties'][0].get('Title', 'NULL')
except (requests.exceptions.RequestException, KeyError, IndexError):
return "NULL"
def fetch_chembl_similarity(smiles, similarity_threshold=100):
try:
url = f"https://www.ebi.ac.uk/chembl/api/data/similarity/{smiles}/{similarity_threshold}"
response = requests.get(url)
response.raise_for_status()
root = ET.fromstring(response.content)
chembl_ids = [m.find('.//molecule_chembl_id').text for m in root.findall('.//molecule') if m.find('.//molecule_chembl_id') is not None]
return chembl_ids if chembl_ids else ["NULL"]
except requests.exceptions.RequestException:
return ["NULL"]
def fetch_names(smiles):
pubchem_name = fetch_pubchem_name(smiles)
chembl_names = fetch_chembl_similarity(smiles)
return pubchem_name, ",".join(chembl_names)
def fetch_rdkit_properties(smiles):
try:
mol = Chem.MolFromSmiles(smiles)
if mol is None:
return ["NULL"] * 7
weight = Descriptors.ExactMolWt(mol)
clogp = Descriptors.MolLogP(mol)
tpsa = Descriptors.TPSA(mol)
charge = Chem.GetFormalCharge(mol)
rotatable_bonds = Descriptors.NumRotatableBonds(mol)
h_donors = Descriptors.NumHDonors(mol)
h_acceptors = Descriptors.NumHAcceptors(mol)
return [weight, clogp, tpsa, charge, rotatable_bonds, h_donors, h_acceptors]
except Exception:
return ["NULL"] * 7
def count_monomers(mols_df):
monomers_dict = defaultdict(int)
for sequence in mols_df['SEQUENCE']:
if isinstance(sequence, str) and len(sequence) > 0:
tokens = re.findall('[A-Z][^A-Z]*', sequence)
for token in tokens:
monomers_dict[token] += 1
return monomers_dict
def main():
parser = argparse.ArgumentParser(description='Analyse non-natural amino acids (NNAA) from PubChem.')
parser.add_argument('--input_dir', help='Input directory containing the monomer data.', default='data/tmp')
parser.add_argument('--mols_file', help='File name relative to input_dir.', default='standard/sequences_standardized.txt')
parser.add_argument('-fetch_names', help='Fetch names from PubChem and ChEMBL.', action='store_true')
parser.add_argument('--target_type', help='Type of target: ncAAs or peptides?', default='ncAAs')
parser.add_argument('--output_file', help='Output CSV file name.', default='analysis.csv')
args = parser.parse_args()
mols_path = args.mols_file
output_path = os.path.join(args.input_dir, args.output_file)
df = pd.read_csv(mols_path, sep='\t')
df = df.dropna(subset=['SMILES']).drop_duplicates(subset=['SMILES'])
df['ROMol'] = df['SMILES'].apply(Chem.MolFromSmiles)
if args.fetch_names:
df[['PUBCHEM_NAME', 'CHEMBL_NAMES']] = df['SMILES'].apply(fetch_names).tolist()
df['Tanimoto_to_Glycine'] = df['ROMol'].apply(cal_tanimoto)
df[['MolWt', 'LogP', 'TPSA', 'FormalCharge', 'RotatableBonds', 'HydrogenDonors', 'HydrogenAcceptors']] = df['SMILES'].apply(fetch_rdkit_properties).tolist()
df.to_csv(output_path, index=False)
print(f"Processing completed. Results saved to {output_path}")
if __name__ == "__main__":
main()