Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| import pandas as pd | |
| from rdkit import Chem | |
| from rdkit.Chem import rdMolDescriptors, DataStructs, Descriptors | |
| import os, sys, requests, tqdm, re, argparse | |
| from collections import defaultdict | |
| import xml.etree.ElementTree as ET | |
def add_canonical_smiles(df):
    """Append the 20 canonical L-amino acids to a monomer table.

    Args:
        df: DataFrame of monomers. It is concatenated unchanged ahead of
            the canonical rows.

    Returns:
        A new DataFrame (index reset) containing the rows of ``df``
        followed by one row per canonical amino acid with the columns
        ``ID`` (one-letter code), ``SMILES``, ``CANONICAL`` (the string
        ``'True'``), ``TERMINAL`` (the string ``'NotTer'``) and ``ROMol``
        (the molecule parsed from the SMILES).
    """
    # Each code sits next to its own structure so the two cannot drift apart.
    # The previous parallel lists paired N/Q and V/I with each other's SMILES.
    canonical_amino_acids = [
        ("W", "C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N"),  # Tryptophan
        ("R", "C(C[C@@H](C(=O)O)N)CNC(=N)N"),            # Arginine
        ("H", "C1=C(NC=N1)C[C@@H](C(=O)O)N"),            # Histidine
        ("P", "C1C[C@H](NC1)C(=O)O"),                    # Proline
        ("K", "C(CCN)C[C@@H](C(=O)O)N"),                 # Lysine
        ("M", "CSCC[C@@H](C(=O)O)N"),                    # Methionine
        ("N", "C([C@@H](C(=O)O)N)C(=O)N"),               # Asparagine
        ("Q", "C(CC(=O)N)[C@@H](C(=O)O)N"),              # Glutamine
        ("E", "C(CC(=O)O)[C@@H](C(=O)O)N"),              # Glutamic acid
        ("D", "OC(=O)C[C@@H](C(=O)O)N"),                 # Aspartic acid
        ("Y", "C1=CC(=CC=C1C[C@@H](C(=O)O)N)O"),         # Tyrosine
        ("F", "C1=CC=C(C=C1)C[C@@H](C(=O)O)N"),          # Phenylalanine
        ("V", "CC(C)[C@@H](C(=O)O)N"),                   # Valine
        ("L", "CC(C)C[C@@H](C(=O)O)N"),                  # Leucine
        ("I", "CC[C@H](C)[C@@H](C(=O)O)N"),              # Isoleucine
        ("T", "C[C@H]([C@@H](C(=O)O)N)O"),               # Threonine
        ("C", "C([C@@H](C(=O)O)N)S"),                    # Cysteine
        ("S", "C([C@@H](C(=O)O)N)O"),                    # Serine
        ("A", "C[C@@H](C(=O)O)N"),                       # Alanine
        ("G", "C(C(=O)O)N"),                             # Glycine
    ]
    one_letter_codes = [code for code, _ in canonical_amino_acids]
    canonical_smiles_list = [smi for _, smi in canonical_amino_acids]

    canonical_df = pd.DataFrame({
        'ID': one_letter_codes,
        'SMILES': canonical_smiles_list,
        'CANONICAL': ['True'] * len(canonical_smiles_list),
        'TERMINAL': ['NotTer'] * len(canonical_smiles_list),
        'ROMol': [Chem.MolFromSmiles(smi) for smi in canonical_smiles_list],
    })
    return pd.concat([df, canonical_df], ignore_index=True)
def cal_tanimoto(mol):
    """Return the Tanimoto similarity between ``mol`` and glycine.

    Both molecules are encoded as radius-2 Morgan fingerprints before the
    comparison.

    Args:
        mol: RDKit molecule, or None (the value ``Chem.MolFromSmiles``
            produces for a SMILES it cannot parse).

    Returns:
        The similarity score, or the string ``"NULL"`` when ``mol`` is
        None. ``"NULL"`` is the placeholder the other helpers in this file
        use for values that could not be computed.
    """
    # Guard before touching RDKit: a None molecule cannot be fingerprinted
    # and would otherwise abort the whole DataFrame.apply() in the caller.
    if mol is None:
        return "NULL"

    l_glycine = Chem.MolFromSmiles("C(C(=O)O)N")
    fp_query = rdMolDescriptors.GetMorganFingerprint(mol, 2)
    fp_glycine = rdMolDescriptors.GetMorganFingerprint(l_glycine, 2)
    return DataStructs.TanimotoSimilarity(fp_query, fp_glycine)
def fetch_pubchem_name(smiles, timeout=30):
    """Look up the PubChem ``Title`` property for a SMILES string.

    Args:
        smiles: SMILES string identifying the compound.
        timeout: Seconds to wait for PubChem before giving up.

    Returns:
        The ``Title`` of the first property record in the response, or the
        string ``"NULL"`` when the request fails, the response is not
        valid JSON, or the expected fields are missing.
    """
    # The SMILES is sent as a URL argument rather than a path segment:
    # characters such as '/', '\\', '#', '+' and '%' are legal in SMILES but
    # would corrupt the path. requests percent-encodes ``params`` for us.
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/property/Title/JSON"
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, params={"smiles": smiles}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        return data['PropertyTable']['Properties'][0].get('Title', 'NULL')
    except (requests.exceptions.RequestException, ValueError, KeyError,
            IndexError, TypeError):
        # ValueError: body was not JSON; TypeError: JSON had an unexpected
        # shape. Name lookup is best-effort, so fall back to the placeholder.
        return "NULL"
def fetch_chembl_similarity(smiles, similarity_threshold=100, timeout=30):
    """Query the ChEMBL similarity endpoint for a SMILES string.

    Args:
        smiles: SMILES string to search with.
        similarity_threshold: Threshold placed in the request URL
            (the default of 100 is passed through unchanged).
        timeout: Seconds to wait for ChEMBL before giving up.

    Returns:
        A list of the ``molecule_chembl_id`` texts found under each
        ``molecule`` element of the XML response, or ``["NULL"]`` when the
        request fails, the response is not well-formed XML, or no ids are
        found.
    """
    from urllib.parse import quote

    # Percent-encode everything, including '/', so SMILES bond and stereo
    # symbols ('/', '\\', '#', '+', '%') cannot be read as URL syntax.
    encoded_smiles = quote(str(smiles), safe="")
    url = f"https://www.ebi.ac.uk/chembl/api/data/similarity/{encoded_smiles}/{similarity_threshold}"
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        root = ET.fromstring(response.content)
    except (requests.exceptions.RequestException, ET.ParseError):
        # ParseError covers non-XML bodies such as HTML error pages.
        return ["NULL"]

    chembl_ids = []
    for molecule in root.findall('.//molecule'):
        id_node = molecule.find('.//molecule_chembl_id')
        # Skip empty id elements: a None text would break the caller's
        # ",".join(...) over this list.
        if id_node is not None and id_node.text:
            chembl_ids.append(id_node.text)
    return chembl_ids if chembl_ids else ["NULL"]
def fetch_names(smiles):
    """Collect the PubChem and ChEMBL lookups for one SMILES string.

    Args:
        smiles: SMILES string handed to both lookup helpers.

    Returns:
        A 2-tuple: the result of ``fetch_pubchem_name`` and the list
        returned by ``fetch_chembl_similarity`` joined with commas.
    """
    # PubChem is queried first, then ChEMBL.
    title = fetch_pubchem_name(smiles)
    joined_chembl = ",".join(fetch_chembl_similarity(smiles))
    return title, joined_chembl
def fetch_rdkit_properties(smiles):
    """Compute seven RDKit descriptors for a SMILES string.

    Args:
        smiles: SMILES string to parse.

    Returns:
        A 7-element list, in this order: ``Descriptors.ExactMolWt``,
        ``Descriptors.MolLogP``, ``Descriptors.TPSA``,
        ``Chem.GetFormalCharge``, ``Descriptors.NumRotatableBonds``,
        ``Descriptors.NumHDonors`` and ``Descriptors.NumHAcceptors``.
        If the SMILES cannot be parsed, or any step raises, the list
        holds seven ``"NULL"`` strings instead.
    """
    placeholder = ["NULL"] * 7
    try:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            return placeholder
        # Evaluated top to bottom, matching the documented column order.
        return [
            Descriptors.ExactMolWt(molecule),
            Descriptors.MolLogP(molecule),
            Descriptors.TPSA(molecule),
            Chem.GetFormalCharge(molecule),
            Descriptors.NumRotatableBonds(molecule),
            Descriptors.NumHDonors(molecule),
            Descriptors.NumHAcceptors(molecule),
        ]
    except Exception:
        return placeholder
def count_monomers(mols_df):
    """Tally monomer tokens across the ``SEQUENCE`` column.

    Each non-empty string sequence is split into tokens that start at an
    uppercase ASCII letter and run up to (not including) the next one.
    Characters before the first uppercase letter are not part of any token.
    Entries that are not strings, or are empty, are ignored.

    Args:
        mols_df: Object whose ``['SEQUENCE']`` item is an iterable of
            sequences (for example a DataFrame column).

    Returns:
        A ``defaultdict(int)`` mapping each token to its occurrence count.
    """
    token_pattern = re.compile('[A-Z][^A-Z]*')
    tally = defaultdict(int)
    for entry in mols_df['SEQUENCE']:
        if not isinstance(entry, str) or not entry:
            continue
        for hit in token_pattern.finditer(entry):
            tally[hit.group()] += 1
    return tally
def main():
    """Command-line entry point: annotate a monomer table and save it as CSV.

    Reads a tab-separated file with a ``SMILES`` column, drops rows with a
    missing or duplicated SMILES, optionally looks up PubChem/ChEMBL names,
    adds the Tanimoto similarity to glycine and seven RDKit descriptors,
    and writes the result to ``<input_dir>/<output_file>``.

    ``--target_type`` is accepted but not used by this function.
    """
    parser = argparse.ArgumentParser(description='Analyse non-natural amino acids (NNAA) from PubChem.')
    parser.add_argument('--input_dir', help='Input directory containing the monomer data.', default='data/tmp')
    parser.add_argument('--mols_file', help='File name relative to input_dir.', default='standard/sequences_standardized.txt')
    # '--fetch_names' matches the other flags; the historical single-dash
    # spelling is kept so existing command lines continue to work.
    parser.add_argument('--fetch_names', '-fetch_names', dest='fetch_names',
                        help='Fetch names from PubChem and ChEMBL.', action='store_true')
    parser.add_argument('--target_type', help='Type of target: ncAAs or peptides?', default='ncAAs')
    parser.add_argument('--output_file', help='Output CSV file name.', default='analysis.csv')
    args = parser.parse_args()

    # The help text documents mols_file as relative to input_dir, so resolve
    # it there first. Fall back to the path exactly as given so invocations
    # that relied on the old working-directory-relative lookup still run.
    mols_path = os.path.join(args.input_dir, args.mols_file)
    if not os.path.exists(mols_path):
        mols_path = args.mols_file
    output_path = os.path.join(args.input_dir, args.output_file)

    df = pd.read_csv(mols_path, sep='\t')
    df = df.dropna(subset=['SMILES']).drop_duplicates(subset=['SMILES'])
    df['ROMol'] = df['SMILES'].apply(Chem.MolFromSmiles)

    if args.fetch_names:
        name_columns = ['PUBCHEM_NAME', 'CHEMBL_NAMES']
        name_rows = [fetch_names(smi) for smi in df['SMILES']]
        # Wrapping the rows in an index-aligned frame keeps the assignment
        # valid even when the table is empty.
        df[name_columns] = pd.DataFrame(name_rows, columns=name_columns, index=df.index)

    # MolFromSmiles yields None for unparsable SMILES; record the file's
    # "NULL" placeholder for those rows instead of failing on them.
    df['Tanimoto_to_Glycine'] = df['ROMol'].apply(
        lambda mol: cal_tanimoto(mol) if mol is not None else "NULL"
    )

    property_columns = ['MolWt', 'LogP', 'TPSA', 'FormalCharge', 'RotatableBonds', 'HydrogenDonors', 'HydrogenAcceptors']
    property_rows = [fetch_rdkit_properties(smi) for smi in df['SMILES']]
    df[property_columns] = pd.DataFrame(property_rows, columns=property_columns, index=df.index)

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # ROMol holds in-memory molecule objects whose text form is only an
    # object repr, so it is left out of the CSV.
    df.drop(columns=['ROMol']).to_csv(output_path, index=False)
    print(f"Processing completed. Results saved to {output_path}")


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()