{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "b6059109", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 4, "id": "38498646", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Load the raw screening table from the workbook that sits next to this notebook.\n", "df = pd.read_excel(io=\"./EC_antibiotic.xlsx\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "c15ce735", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COADD_IDINHIB_AVESMILESHIT
0CO-ADD:030375393.09N1(c2cccc(OC)c2)C(=O)NC(\\C(=C/c(ccc3[N+](=O)[O...1.0
1CO-ADD:0307303100.26C(C(O)=O)(=CN(CC)c1c2cc(c(N(CC3)CCN3C(=S)NC(=O...1.0
2CO-ADD:0240410100.00C(C#N)(C1c2ccc(Br)s2)=C(N)N(C(CCCC3=O)=C13)c4c...1.0
3CO-ADD:024261756.74c(CN(CC1)CCN1C)(cc(Br)c(c23)cccn2)c3O1.0
4CO-ADD:0237159100.66C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H](N(C)C)C(=C(C...1.0
...............
82753CO-ADD:0254726NaNN12C(SC(C(C)C)=N1)=NC(O)=CC2=ONaN
82754CO-ADD:0255128NaNS(=O)(=O)(c1ccc(cc1)C)N(Cc2ccccc2)c3ccccc3C(=O...NaN
82755CO-ADD:0254376NaNN1(c2ccccc2)\\C(=N/c3ccccc3)\\S\\C(=C/c4c(cccc4Cl...NaN
82756CO-ADD:0252344NaNC12(c3c(cccc3)C(=NN1c4ccccc4)c5ccc(cc5)Cl)SC(=...NaN
82757CO-ADD:0252267NaNc1(C#N)c(CCC(C2)C(C)(C)C)c2sc1\\N=C\\c(cccn3)c3NaN
\n", "

82758 rows × 4 columns

\n", "
" ], "text/plain": [ " COADD_ID INHIB_AVE \\\n", "0 CO-ADD:0303753 93.09 \n", "1 CO-ADD:0307303 100.26 \n", "2 CO-ADD:0240410 100.00 \n", "3 CO-ADD:0242617 56.74 \n", "4 CO-ADD:0237159 100.66 \n", "... ... ... \n", "82753 CO-ADD:0254726 NaN \n", "82754 CO-ADD:0255128 NaN \n", "82755 CO-ADD:0254376 NaN \n", "82756 CO-ADD:0252344 NaN \n", "82757 CO-ADD:0252267 NaN \n", "\n", " SMILES HIT \n", "0 N1(c2cccc(OC)c2)C(=O)NC(\\C(=C/c(ccc3[N+](=O)[O... 1.0 \n", "1 C(C(O)=O)(=CN(CC)c1c2cc(c(N(CC3)CCN3C(=S)NC(=O... 1.0 \n", "2 C(C#N)(C1c2ccc(Br)s2)=C(N)N(C(CCCC3=O)=C13)c4c... 1.0 \n", "3 c(CN(CC1)CCN1C)(cc(Br)c(c23)cccn2)c3O 1.0 \n", "4 C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H](N(C)C)C(=C(C... 1.0 \n", "... ... ... \n", "82753 N12C(SC(C(C)C)=N1)=NC(O)=CC2=O NaN \n", "82754 S(=O)(=O)(c1ccc(cc1)C)N(Cc2ccccc2)c3ccccc3C(=O... NaN \n", "82755 N1(c2ccccc2)\\C(=N/c3ccccc3)\\S\\C(=C/c4c(cccc4Cl... NaN \n", "82756 C12(c3c(cccc3)C(=NN1c4ccccc4)c5ccc(cc5)Cl)SC(=... NaN \n", "82757 c1(C#N)c(CCC(C2)C(C)(C)C)c2sc1\\N=C\\c(cccn3)c3 NaN \n", "\n", "[82758 rows x 4 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] },
{ "cell_type": "code", "execution_count": 9, "id": "b18f66b6", "metadata": {}, "outputs": [], "source": [ "# Keep a single row per distinct SMILES string (the first occurrence wins).\n", "df_new = df.drop_duplicates(subset='SMILES')" ] },
{ "cell_type": "code", "execution_count": 12, "id": "294e3364", "metadata": {}, "outputs": [], "source": [ "# Replace every remaining NaN with 0, in all columns.\n", "# NOTE(review): rows without a measurement end up as INHIB_AVE = 0 / HIT = 0 and are\n", "# later labelled as non-hits - confirm that this is the intended reading of a missing value.\n", "df_save = df_new.fillna(value=0)" ] },
{ "cell_type": "code", "execution_count": 15, "id": "cf2602d6", "metadata": {}, "outputs": [], "source": [ "# Snapshot of the cleaned table; the DataFrame index is written as the first column.\n", "df_save.to_csv(path_or_buf=\"./EC_antibiotic_correctinfo.csv\")" ] },
{ "cell_type": "code", "execution_count": 16, "id": "10f2eb54", "metadata": {}, "outputs": [], "source": [ "# Interchangeable phrasings of one yes/no question; the {molecule} placeholder is\n", "# filled in through str.format when the prompts are generated.\n", "questions = [\n", "    \"Does the proposed molecule {molecule} exhibit antibacterial activity against Escherichia coli?\",\n", "    \"Is {molecule} capable of inhibiting the growth or survival of E. coli?\",\n", "    \"Can {molecule} effectively kill or suppress Escherichia coli cells?\",\n", "    \"Does {molecule} demonstrate bactericidal effects against E. coli in vitro?\",\n", "    \"Is Escherichia coli susceptible to treatment with the compound {molecule}?\",\n", "    \"Can exposure to {molecule} lead to reduced viability of E. coli?\",\n", "    \"Does {molecule} show antimicrobial efficacy specifically against Escherichia coli strains?\",\n", "    \"Is {molecule} effective in eliminating E. coli under experimental conditions?\",\n", "    \"Can {molecule} act as an antibacterial agent targeting Escherichia coli?\",\n", "    \"Does treatment with {molecule} result in bacteriostatic or bactericidal effects on E. coli?\"\n", "]" ] },
{ "cell_type": "code", "execution_count": 18, "id": "be4cc8f5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COADD_IDINHIB_AVESMILESHIT
0CO-ADD:030375393.09N1(c2cccc(OC)c2)C(=O)NC(\\C(=C/c(ccc3[N+](=O)[O...1.0
1CO-ADD:0307303100.26C(C(O)=O)(=CN(CC)c1c2cc(c(N(CC3)CCN3C(=S)NC(=O...1.0
2CO-ADD:0240410100.00C(C#N)(C1c2ccc(Br)s2)=C(N)N(C(CCCC3=O)=C13)c4c...1.0
3CO-ADD:024261756.74c(CN(CC1)CCN1C)(cc(Br)c(c23)cccn2)c3O1.0
4CO-ADD:0237159100.66C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H](N(C)C)C(=C(C...1.0
...............
82753CO-ADD:02547260.00N12C(SC(C(C)C)=N1)=NC(O)=CC2=O0.0
82754CO-ADD:02551280.00S(=O)(=O)(c1ccc(cc1)C)N(Cc2ccccc2)c3ccccc3C(=O...0.0
82755CO-ADD:02543760.00N1(c2ccccc2)\\C(=N/c3ccccc3)\\S\\C(=C/c4c(cccc4Cl...0.0
82756CO-ADD:02523440.00C12(c3c(cccc3)C(=NN1c4ccccc4)c5ccc(cc5)Cl)SC(=...0.0
82757CO-ADD:02522670.00c1(C#N)c(CCC(C2)C(C)(C)C)c2sc1\\N=C\\c(cccn3)c30.0
\n", "

82080 rows × 4 columns

\n", "
" ], "text/plain": [ " COADD_ID INHIB_AVE \\\n", "0 CO-ADD:0303753 93.09 \n", "1 CO-ADD:0307303 100.26 \n", "2 CO-ADD:0240410 100.00 \n", "3 CO-ADD:0242617 56.74 \n", "4 CO-ADD:0237159 100.66 \n", "... ... ... \n", "82753 CO-ADD:0254726 0.00 \n", "82754 CO-ADD:0255128 0.00 \n", "82755 CO-ADD:0254376 0.00 \n", "82756 CO-ADD:0252344 0.00 \n", "82757 CO-ADD:0252267 0.00 \n", "\n", " SMILES HIT \n", "0 N1(c2cccc(OC)c2)C(=O)NC(\\C(=C/c(ccc3[N+](=O)[O... 1.0 \n", "1 C(C(O)=O)(=CN(CC)c1c2cc(c(N(CC3)CCN3C(=S)NC(=O... 1.0 \n", "2 C(C#N)(C1c2ccc(Br)s2)=C(N)N(C(CCCC3=O)=C13)c4c... 1.0 \n", "3 c(CN(CC1)CCN1C)(cc(Br)c(c23)cccn2)c3O 1.0 \n", "4 C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H](N(C)C)C(=C(C... 1.0 \n", "... ... ... \n", "82753 N12C(SC(C(C)C)=N1)=NC(O)=CC2=O 0.0 \n", "82754 S(=O)(=O)(c1ccc(cc1)C)N(Cc2ccccc2)c3ccccc3C(=O... 0.0 \n", "82755 N1(c2ccccc2)\\C(=N/c3ccccc3)\\S\\C(=C/c4c(cccc4Cl... 0.0 \n", "82756 C12(c3c(cccc3)C(=NN1c4ccccc4)c5ccc(cc5)Cl)SC(=... 0.0 \n", "82757 c1(C#N)c(CCC(C2)C(C)(C)C)c2sc1\\N=C\\c(cccn3)c3 0.0 \n", "\n", "[82080 rows x 4 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_save" ] },
{ "cell_type": "code", "execution_count": 23, "id": "e9698345", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "# Seed NumPy's global generator so the template picked for each row is repeatable.\n", "# Re-run this cell before re-running the generation loop, otherwise the draws continue\n", "# from wherever the generator was left.\n", "np.random.seed(2024)" ] },
{ "cell_type": "code", "execution_count": 24, "id": "a298e333", "metadata": {}, "outputs": [], "source": [ "from tqdm import tqdm" ] },
{ "cell_type": "code", "execution_count": 25, "id": "e545da5a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 82080/82080 [00:04<00:00, 18390.14it/s]\n" ] } ], "source": [ "# Build one (question, answer) pair per row of df_save, in row order.\n", "question = []\n", "answer = []\n", "# Walk the two needed columns directly: the previous df_save.loc[item][...] lookups\n", "# materialised a whole row Series twice per iteration and relied on chained indexing.\n", "for smile, hit in tqdm(zip(df_save['SMILES'], df_save['HIT']), total=len(df_save)):\n", "    # Exactly one np.random.choice call per row, so the seeded sequence of templates\n", "    # is the same as with the index-based loop.\n", "    ques = np.random.choice(questions)\n", "    question.append(ques.format(molecule=smile))\n", "    # HIT equal to 1 is answered 'Yes'; every other value is answered 'No'.\n", "    answer.append('Yes' if hit == 1 else 'No')" ] },
{ "cell_type": "code", "execution_count": 26, "id": "70ded6ff", "metadata": {}, "outputs": [], "source": [ "# Attach the generated prompts and labels; both lists are in df_save row order.\n", "df_save['question'] = question\n", "df_save['answer'] = answer" ] },
{ "cell_type": "code", "execution_count": 27, "id": "464476d6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COADD_IDINHIB_AVESMILESHITquestionanswer
0CO-ADD:030375393.09N1(c2cccc(OC)c2)C(=O)NC(\\C(=C/c(ccc3[N+](=O)[O...1.0Can N1(c2cccc(OC)c2)C(=O)NC(\\C(=C/c(ccc3[N+](=...Yes
1CO-ADD:0307303100.26C(C(O)=O)(=CN(CC)c1c2cc(c(N(CC3)CCN3C(=S)NC(=O...1.0Does the proposed molecule C(C(O)=O)(=CN(CC)c1...Yes
2CO-ADD:0240410100.00C(C#N)(C1c2ccc(Br)s2)=C(N)N(C(CCCC3=O)=C13)c4c...1.0Does the proposed molecule C(C#N)(C1c2ccc(Br)s...Yes
3CO-ADD:024261756.74c(CN(CC1)CCN1C)(cc(Br)c(c23)cccn2)c3O1.0Is Escherichia coli susceptible to treatment w...Yes
4CO-ADD:0237159100.66C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H](N(C)C)C(=C(C...1.0Is C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H](N(C)C)C(=...Yes
.....................
82753CO-ADD:02547260.00N12C(SC(C(C)C)=N1)=NC(O)=CC2=O0.0Is N12C(SC(C(C)C)=N1)=NC(O)=CC2=O capable of i...No
82754CO-ADD:02551280.00S(=O)(=O)(c1ccc(cc1)C)N(Cc2ccccc2)c3ccccc3C(=O...0.0Does the proposed molecule S(=O)(=O)(c1ccc(cc1...No
82755CO-ADD:02543760.00N1(c2ccccc2)\\C(=N/c3ccccc3)\\S\\C(=C/c4c(cccc4Cl...0.0Can N1(c2ccccc2)\\C(=N/c3ccccc3)\\S\\C(=C/c4c(ccc...No
82756CO-ADD:02523440.00C12(c3c(cccc3)C(=NN1c4ccccc4)c5ccc(cc5)Cl)SC(=...0.0Can C12(c3c(cccc3)C(=NN1c4ccccc4)c5ccc(cc5)Cl)...No
82757CO-ADD:02522670.00c1(C#N)c(CCC(C2)C(C)(C)C)c2sc1\\N=C\\c(cccn3)c30.0Does the proposed molecule c1(C#N)c(CCC(C2)C(C...No
\n", "

82080 rows × 6 columns

\n", "
" ], "text/plain": [ " COADD_ID INHIB_AVE \\\n", "0 CO-ADD:0303753 93.09 \n", "1 CO-ADD:0307303 100.26 \n", "2 CO-ADD:0240410 100.00 \n", "3 CO-ADD:0242617 56.74 \n", "4 CO-ADD:0237159 100.66 \n", "... ... ... \n", "82753 CO-ADD:0254726 0.00 \n", "82754 CO-ADD:0255128 0.00 \n", "82755 CO-ADD:0254376 0.00 \n", "82756 CO-ADD:0252344 0.00 \n", "82757 CO-ADD:0252267 0.00 \n", "\n", " SMILES HIT \\\n", "0 N1(c2cccc(OC)c2)C(=O)NC(\\C(=C/c(ccc3[N+](=O)[O... 1.0 \n", "1 C(C(O)=O)(=CN(CC)c1c2cc(c(N(CC3)CCN3C(=S)NC(=O... 1.0 \n", "2 C(C#N)(C1c2ccc(Br)s2)=C(N)N(C(CCCC3=O)=C13)c4c... 1.0 \n", "3 c(CN(CC1)CCN1C)(cc(Br)c(c23)cccn2)c3O 1.0 \n", "4 C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H](N(C)C)C(=C(C... 1.0 \n", "... ... ... \n", "82753 N12C(SC(C(C)C)=N1)=NC(O)=CC2=O 0.0 \n", "82754 S(=O)(=O)(c1ccc(cc1)C)N(Cc2ccccc2)c3ccccc3C(=O... 0.0 \n", "82755 N1(c2ccccc2)\\C(=N/c3ccccc3)\\S\\C(=C/c4c(cccc4Cl... 0.0 \n", "82756 C12(c3c(cccc3)C(=NN1c4ccccc4)c5ccc(cc5)Cl)SC(=... 0.0 \n", "82757 c1(C#N)c(CCC(C2)C(C)(C)C)c2sc1\\N=C\\c(cccn3)c3 0.0 \n", "\n", " question answer \n", "0 Can N1(c2cccc(OC)c2)C(=O)NC(\\C(=C/c(ccc3[N+](=... Yes \n", "1 Does the proposed molecule C(C(O)=O)(=CN(CC)c1... Yes \n", "2 Does the proposed molecule C(C#N)(C1c2ccc(Br)s... Yes \n", "3 Is Escherichia coli susceptible to treatment w... Yes \n", "4 Is C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H](N(C)C)C(=... Yes \n", "... ... ... \n", "82753 Is N12C(SC(C(C)C)=N1)=NC(O)=CC2=O capable of i... No \n", "82754 Does the proposed molecule S(=O)(=O)(c1ccc(cc1... No \n", "82755 Can N1(c2ccccc2)\\C(=N/c3ccccc3)\\S\\C(=C/c4c(ccc... No \n", "82756 Can C12(c3c(cccc3)C(=NN1c4ccccc4)c5ccc(cc5)Cl)... No \n", "82757 Does the proposed molecule c1(C#N)c(CCC(C2)C(C... 
No \n", "\n", "[82080 rows x 6 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_save" ] },
{ "cell_type": "code", "execution_count": 31, "id": "63729449", "metadata": {}, "outputs": [], "source": [ "# Hold out 3000 rows as the test split; random_state fixes which rows are drawn.\n", "df_test = df_save.sample(n=3000,random_state=2024)\n", "df_test.to_csv(\"./ec_test\")" ] },
{ "cell_type": "code", "execution_count": 35, "id": "53656d4a", "metadata": {}, "outputs": [], "source": [ "# Every row whose index label was not sampled into df_test goes to the training CSV.\n", "df_save.loc[~df_save.index.isin(df_test.index)].to_csv(\"./ec_train\")" ] },
{ "cell_type": "code", "execution_count": 36, "id": "a2dc37f7", "metadata": {}, "outputs": [], "source": [ "# First JSON export, without an _id field. The two cells below write the same two\n", "# paths again with _id added, so these files are overwritten on a full run.\n", "df_test.to_json(\n", "    './ec_test.json',\n", "    orient='records',\n", "    indent=2)\n", "df_save.loc[~df_save.index.isin(df_test.index)].to_json(\n", "    './ec_train.json',\n", "    orient='records',\n", "    indent=2)" ] },
{ "cell_type": "code", "execution_count": 38, "id": "6a17842a", "metadata": {}, "outputs": [], "source": [ "# orient='records' does not emit the index, so carry the row label along as _id.\n", "df_test['_id'] = list(df_test.index)\n", "df_test.to_json(\n", "    './ec_test.json',\n", "    orient='records',\n", "    indent=2)" ] },
{ "cell_type": "code", "execution_count": 39, "id": "e0a2b337", "metadata": {}, "outputs": [], "source": [ "# .copy() gives df_train its own data, so adding the _id column below no longer\n", "# raises pandas' SettingWithCopyWarning about writing to a slice of df_save.\n", "df_train = df_save.loc[~df_save.index.isin(df_test.index)].copy()\n", "df_train['_id'] = list(df_train.index)\n", "df_train.to_json(\n", "    './ec_train.json',\n", "    orient='records',\n", "    indent=2)" ] }
], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": {
"codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 5 }