{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Manipulating Molecules and Proteins within OpenBioMed\n", "\n", "OpenBioMed develops specific data structures for handling molecules, pockets, proteins and other data modalities in chemistry and life science. We provide easy-to-use APIs for input loading, data manipulation and format transformation." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/AIRvePFS/dair/luoyz-data/projects/OpenBioMed/OpenBioMed_arch\n" ] } ], "source": [ "# Change working directory\n", "import os\n", "import sys\n", "parent = os.path.dirname(os.path.abspath(''))\n", "print(parent)\n", "sys.path.append(parent)\n", "os.chdir(parent)\n", "\n", "import logging\n", "logging.basicConfig(level=logging.ERROR)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we present several basic usage of the `Molecule` data structure." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "43 48\n", "[[ 7.15979818 0.80874434 -1.1777022 ]\n", " [ 6.03656999 0.98159063 -0.25523318]\n", " [ 6.5018441 1.18971222 1.11943242]\n", " [ 6.66523363 -0.20081618 1.6896087 ]\n", " [ 5.54799761 -1.00059728 1.0349241 ]\n", " [ 5.15551891 -0.20738398 -0.21709433]\n", " [ 3.67186942 0.23835869 -0.15975325]\n", " [ 2.79871947 -0.89883247 -0.09910995]\n", " [ 1.46873112 -0.51443636 -0.1461924 ]\n", " [ 1.15329933 0.75341864 0.16184495]\n", " [-0.15212908 1.07702788 0.11271499]\n", " [-1.13060623 0.13843592 -0.18651713]\n", " [-2.56195997 0.60224809 -0.26848812]\n", " [-2.61589085 1.98880147 -0.91233925]\n", " [-1.7825011 2.96467228 -0.19616682]\n", " [-0.53343712 2.50816882 0.45025218]\n", " [-2.50230208 3.96691016 0.47985061]\n", " [-3.13052426 3.64024452 1.69501868]\n", " [-3.93447426 4.55568201 2.36219254]\n", " [-4.12855294 5.818542 1.8206921 ]\n", " [-3.51034937 6.19212391 0.61558734]\n", " [-2.67001728 5.2823797 -0.0931741 ]\n", " [-2.08308779 5.79607309 -1.29704122]\n", " [-2.33174422 7.10892277 -1.74528179]\n", " [-3.17644012 7.95082858 -1.04553662]\n", " [-3.7554685 7.4915732 0.1285071 ]\n", " [-0.99163735 4.89717253 -2.30902942]\n", " [-0.70191467 -1.16676402 -0.45029655]\n", " [ 0.60055216 -1.49510271 -0.43956623]\n", " [-1.64679714 -2.12915293 -0.80618072]\n", " [-2.54437017 -2.56353793 0.27559944]\n", " [-2.08539457 -3.88347299 0.90232423]\n", " [-1.93167109 -4.97282649 -0.0844877 ]\n", " [-1.06227749 -4.61380138 -1.24475574]\n", " [-1.29295189 -3.15979488 -1.78961318]\n", " [-1.27938582 -5.56258492 -2.45360204]\n", " [-0.2174369 -5.40707465 -3.46087308]\n", " [ 0.63034382 -5.28808772 -4.24613934]\n", " [-1.77542539 -6.27450847 0.44844653]\n", " [-0.96085988 -7.08578706 0.01928373]\n", " [-2.64270006 -6.63396037 1.60393026]\n", " [-2.26090304 -7.45610279 2.59209198]\n", " [-3.86976173 -6.05818107 1.64092191]\n", " [ 6.79873013 0.55622683 -2.18555309]\n", " [ 7.73818536 1.7383868 -1.26932641]\n", " [ 7.86033797 0.03522178 -0.85261627]\n", " [ 5.73395043 1.73001106 1.67486844]\n", " [ 7.419509 1.78988913 1.19503237]\n", " [ 6.5962572 -0.22311772 2.77960104]\n", " [ 7.64226268 -0.61823704 1.40514846]\n", " [ 5.92360057 -2.00082331 0.76505514]\n", " [ 4.71342704 -1.145578 1.74648416]\n", " [ 5.29425385 -0.84813919 -1.10409702]\n", " [ 3.51786447 0.89570764 0.71191176]\n", " [ 3.44634771 0.8119402 -1.07047723]\n", " [-3.17026893 -0.07509033 -0.88742447]\n", " [-2.98709825 0.62720276 0.74431733]\n", " [-3.66005573 2.31527798 -0.97518097]\n", " [-2.24671867 1.91806773 -1.94550768]\n", " [-0.61223973 2.56657124 1.54657908]\n", " [ 0.28547577 3.17511762 0.14086687]\n", " [-3.01851963 2.64286832 2.11477936]\n", " [-4.4364677 4.27770288 3.28439649]\n", " [-4.78644737 6.51347537 2.34814639]\n", " [-1.86585325 7.48365619 -2.66101006]\n", " [-3.36810664 8.96373699 -1.39865854]\n", " [-4.41311635 8.16879901 0.66790813]\n", " [-3.56237454 -2.66852699 -0.14799078]\n", " [-2.61952358 -1.8139944 1.07031525]\n", " [-2.81165169 -4.1515399 1.68998666]\n", " [-1.10541696 -3.76139814 1.40803808]\n", " [-0.03124457 -4.68098514 -0.87226339]\n", " [-2.14762764 -3.18451022 -2.48573045]\n", " [-0.43654361 -2.83943951 -2.40281023]\n", " [-2.26024722 -5.34812236 -2.90580875]\n", " [-1.32754918 -6.62104859 -2.18439887]\n", " [-1.31476619 -7.99053664 2.5542106 ]\n", " [-2.91587013 -7.61759684 3.43215917]]\n", "./tmp/molecule_with_conformation.sdf\n" ] } ], "source": [ "# Manipulating molecules\n", "from open_biomed.data.molecule import Molecule\n", "\n", "# Initialize a molecule with a SMILES string\n", "molecule = Molecule.from_smiles(\"CN1CCC[C@H]1COC2=NC3=C(CCN(C3)C4=CC=CC5=C4C(=CC=C5)Cl)C(=N2)N6CCN([C@H](C6)CC#N)C(=O)C(=C)F\")\n", "# Add a RDKit molecule object\n", "molecule._add_rdmol()\n", "# Then, by calling molecule.rdmol, you can get or modify information with RDKit\n", "print(molecule.rdmol.GetNumAtoms(), molecule.rdmol.GetNumBonds())\n", "# Add a 3D conformer\n", "molecule._add_conformer(mode='3D')\n", "# You can directly obtain an ndarray of 3D coordinates, which is synchronized in molecule.rdmol\n", "print(molecule.conformer)\n", "# Save the molecule as a .sdf file\n", "print(molecule.save_sdf(\"./tmp/molecule_with_conformation.sdf\"))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C=C(F)C(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c2CCN(c2cccc4cccc(Cl)c24)C3)C[C@@H]1CC#N True\n", "C=CC(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c(F)c(-c4nc(N)cc(C)c4C(F)(F)F)c(Cl)cc23)[C@@H](C)C1 ./tmp/pubchem_146624881.pkl\n", "0.31693989071038253\n", "C=CC(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c2CCN(c2cccc4cccc(Cl)c24)C3)C[C@@H]1CC#N ./tmp/pubchem_134325731.pkl\n", "0.8769230769230769\n", "C=CC(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c2CCN(c2cccc4cccc(C)c24)C3)C[C@@H]1CC#N ./tmp/pubchem_134326084.pkl\n", "0.7941176470588235\n", "CN1CCC[C@H]1COc1nc(N2CCNCC2)c2cnc(-c3cccc4cccc(Cl)c34)c(F)c2n1 ./tmp/pubchem_155233433.pkl\n", "0.46153846153846156\n" ] } ], "source": [ "import asyncio\n", "from open_biomed.core.web_request import PubChemRequester, PubChemStructureRequester\n", "from open_biomed.data.molecule import check_identical_molecules, molecule_fingerprint_similarity\n", "\n", "# We provide APIs for obtaining molecules from PubChem via molecule name or ID\n", "requester = PubChemRequester(db_url=\"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{accession}/SDF\")\n", "# Web request is performed via asynchronous calls\n", "pubchem_molecule = (await asyncio.create_task(requester.run(\"Adagrasib\")))[0][0]\n", "# Check if Adagrasib is the same molecule as the one we construct earlier\n", "print(pubchem_molecule, check_identical_molecules(molecule, pubchem_molecule))\n", "\n", "# We also provide APIs for searching struturally similar molecules on PubChem\n", "requester = PubChemStructureRequester()\n", "similar_molecules = (await asyncio.create_task(requester.run(\n", " molecule=pubchem_molecule, # query molecule\n", " threshold=0.8, # structural similarity threshold\n", " max_records=5, # maximum number of molecules\n", ")))\n", "# The first molecule is the requested molecule itself (identical)\n", "for i in range(1, 5):\n", " # Check the SMILES and saved files of similar molecules\n", " print(similar_molecules[0][i], similar_molecules[1][i])\n", " # Let's check the Morgan fingerprint similarity of returned molecule!\n", " # NOTE: the threshold value used in Pubchem is not Morgan fingerprint similarity, so the output scores could be lower than 0.8\n", " print(molecule_fingerprint_similarity(pubchem_molecule, similar_molecules[0][i], fingerprint_type=\"morgan\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we present some basic usage of the `Protein` data structure" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.8164251207729468\n", "MTEYKLVVVGA-GGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNT-KSFEDIHHYREQIKRVKDSEDVPMVLVGNKCD-LPSRTVDTKQAQDLARSYGIPFIETSAKTRQR-VED-AFYTLVREIRQYRLK-KISKEEKTP----GCVKIKK------C-IIM-\n", "MTEYKLVVVGAVG-VGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINN-IKSFEDIHHYREQIKRVKDSEDVPMVLVGNKC-ALPSRTVDTKQAQDLARSYGIPFIETSAKTRQ-GV-DDAFYTLVREIR----KHK---E-K--MSKDG--K-KKKKSKTKCSI--L\n" ] } ], "source": [ "# Manipulating proteins\n", "from open_biomed.data.protein import Protein, protein_sequence_similarity\n", "\n", "# Initalize the protein object with its amino acid sequence\n", "protein1 = Protein.from_fasta(\"MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQRVEDAFYTLVREIRQYRLKKISKEEKTPGCVKIKKCIIM\")\n", "protein2 = Protein.from_fasta(\"MTEYKLVVVGAVGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNIKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCALPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKHKEKMSKDGKKKKKSKTKCSIL\")\n", "# Protein sequence alignment\n", "outputs = protein_sequence_similarity(protein1, protein2)\n", "print(outputs[0])\n", "print(outputs[1])\n", "print(outputs[2])\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKSDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKH\n", "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKSDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKH\n", "./tmp/msa_results_1741148484988/uniref.a3m\n", "./tmp/foldseek_results_1741148485003\n" ] } ], "source": [ "import asyncio\n", "from open_biomed.core.web_request import PDBRequester, MSARequester, FoldSeekRequester\n", "\n", "# We provide APIs for obtaining proteins on PDB, AlphafoldDB and UniProt\n", "requester = PDBRequester()\n", "# Web request is performed via asynchronous calls\n", "protein_with_structure = (await asyncio.create_task(requester.run(\"4EPT\")))[0][0]\n", "print(protein_with_structure)\n", "# If the protein object has 3D structure, you can save it as a pdb file\n", "file = protein_with_structure.save_pdb()\n", "# We also provide APIs for loading a protein from a pdb file\n", "loaded_protein = Protein.from_pdb_file(file)\n", "print(loaded_protein)\n", "\n", "# We also provide APIs for performing MSA search and FoldSeek with web services\n", "requester = MSARequester()\n", "# NOTE: Performing MSA may take a long time and may occasionally fail\n", "outputs = (await asyncio.create_task(requester.run(loaded_protein)))[0][0]\n", "# Check the .a3m file for MSA results!\n", "print(outputs)\n", "\n", "requester = FoldSeekRequester(database=[\"afdb50\"], timeout=60)\n", "# NOTE: Performing FoldSeek may take a long time and may occasionally fail\n", "outputs = (await asyncio.create_task(requester.run(loaded_protein)))[0][0]\n", "# Check the folder for FoldSeek results!\n", "print(outputs)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.7 ('biomed')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "2b5492c31ef84abdc69aadb95e4c210f44c226a5800d1d766b22f7a50017392c" } } }, "nbformat": 4, "nbformat_minor": 2 }