Commit ·
cfa31cd
1
Parent(s): df1b305
Renamed data files and notebooks accordingly
Browse files- .gitignore +5 -0
- data/PROTAC-DB-Scraped.csv +0 -0
- data/PROTAC-DB.csv +0 -0
- data/PROTAC-Pedia.csv +0 -0
- data/poi_uniprot2sequence.pkl +0 -3
- notebooks/data_curation.ipynb +5 -5
- notebooks/protac_degradation_predictor.ipynb +19 -3
.gitignore
CHANGED
|
@@ -158,3 +158,8 @@ cython_debug/
|
|
| 158 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 159 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 160 |
#.idea/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 159 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 160 |
#.idea/
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# Custom files
|
| 164 |
+
|
| 165 |
+
data/uniprot2embedding.h5
|
data/PROTAC-DB-Scraped.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/PROTAC-DB.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/PROTAC-Pedia.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/poi_uniprot2sequence.pkl
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:85cfb7aadaa54b48490faed7f8d791caa38a354c26501cc213508aeb526ed189
|
| 3 |
-
size 220472
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/data_curation.ipynb
CHANGED
|
@@ -116,7 +116,7 @@
|
|
| 116 |
},
|
| 117 |
{
|
| 118 |
"cell_type": "code",
|
| 119 |
-
"execution_count":
|
| 120 |
"metadata": {},
|
| 121 |
"outputs": [
|
| 122 |
{
|
|
@@ -128,7 +128,7 @@
|
|
| 128 |
}
|
| 129 |
],
|
| 130 |
"source": [
|
| 131 |
-
"protacdb_file = os.path.join(data_dir, '
|
| 132 |
"protacdb_url = 'http://cadd.zju.edu.cn/protacdb/statics/binaryDownload/csv/protac/protac.csv'\n",
|
| 133 |
"if os.path.exists(protacdb_file):\n",
|
| 134 |
" protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
|
|
@@ -147,7 +147,7 @@
|
|
| 147 |
},
|
| 148 |
{
|
| 149 |
"cell_type": "code",
|
| 150 |
-
"execution_count":
|
| 151 |
"metadata": {},
|
| 152 |
"outputs": [
|
| 153 |
{
|
|
@@ -277,7 +277,7 @@
|
|
| 277 |
],
|
| 278 |
"source": [
|
| 279 |
"scraped_protac_df = pd.read_csv(os.path.join(\n",
|
| 280 |
-
" data_dir, '
|
| 281 |
"# Rename columns\n",
|
| 282 |
"old2new = {\n",
|
| 283 |
" \"Assay (Percent degradation)\": \"Percent degradation (%)\",\n",
|
|
@@ -938,7 +938,7 @@
|
|
| 938 |
}
|
| 939 |
],
|
| 940 |
"source": [
|
| 941 |
-
"df_file = os.path.join(data_dir, '
|
| 942 |
"protac_pedia_df = pd.read_csv(df_file)\n",
|
| 943 |
"print(f'protac_pedia_df len: {len(protac_pedia_df)}')\n",
|
| 944 |
"protac_pedia_df.head()"
|
|
|
|
| 116 |
},
|
| 117 |
{
|
| 118 |
"cell_type": "code",
|
| 119 |
+
"execution_count": null,
|
| 120 |
"metadata": {},
|
| 121 |
"outputs": [
|
| 122 |
{
|
|
|
|
| 128 |
}
|
| 129 |
],
|
| 130 |
"source": [
|
| 131 |
+
"protacdb_file = os.path.join(data_dir, 'PROTAC-DB.csv')\n",
|
| 132 |
"protacdb_url = 'http://cadd.zju.edu.cn/protacdb/statics/binaryDownload/csv/protac/protac.csv'\n",
|
| 133 |
"if os.path.exists(protacdb_file):\n",
|
| 134 |
" protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
|
|
|
|
| 147 |
},
|
| 148 |
{
|
| 149 |
"cell_type": "code",
|
| 150 |
+
"execution_count": null,
|
| 151 |
"metadata": {},
|
| 152 |
"outputs": [
|
| 153 |
{
|
|
|
|
| 277 |
],
|
| 278 |
"source": [
|
| 279 |
"scraped_protac_df = pd.read_csv(os.path.join(\n",
|
| 280 |
+
" data_dir, 'PROTAC-DB-Scraped.csv'))\n",
|
| 281 |
"# Rename columns\n",
|
| 282 |
"old2new = {\n",
|
| 283 |
" \"Assay (Percent degradation)\": \"Percent degradation (%)\",\n",
|
|
|
|
| 938 |
}
|
| 939 |
],
|
| 940 |
"source": [
|
| 941 |
+
"df_file = os.path.join(data_dir, 'PROTAC-Pedia.csv')\n",
|
| 942 |
"protac_pedia_df = pd.read_csv(df_file)\n",
|
| 943 |
"print(f'protac_pedia_df len: {len(protac_pedia_df)}')\n",
|
| 944 |
"protac_pedia_df.head()"
|
notebooks/protac_degradation_predictor.ipynb
CHANGED
|
@@ -243,7 +243,7 @@
|
|
| 243 |
"source": [
|
| 244 |
"import pandas as pd\n",
|
| 245 |
"\n",
|
| 246 |
-
"protac_df = pd.read_csv('../data/
|
| 247 |
"protac_df.head()"
|
| 248 |
]
|
| 249 |
},
|
|
@@ -341,7 +341,23 @@
|
|
| 341 |
"cell_type": "markdown",
|
| 342 |
"metadata": {},
|
| 343 |
"source": [
|
| 344 |
-
"Protein embeddings downloaded from [Uniprot](https://www.uniprot.org/help/embeddings)."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
]
|
| 346 |
},
|
| 347 |
{
|
|
@@ -394,7 +410,7 @@
|
|
| 394 |
"from tqdm.auto import tqdm\n",
|
| 395 |
"\n",
|
| 396 |
"protein_embeddings = {}\n",
|
| 397 |
-
"with h5py.File(\"../data/
|
| 398 |
" print(f\"number of entries: {len(file.items()):,}\")\n",
|
| 399 |
" uniprots = protac_df['Uniprot'].unique().tolist()\n",
|
| 400 |
" uniprots += protac_df['E3 Ligase Uniprot'].unique().tolist()\n",
|
|
|
|
| 243 |
"source": [
|
| 244 |
"import pandas as pd\n",
|
| 245 |
"\n",
|
| 246 |
+
"protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')\n",
|
| 247 |
"protac_df.head()"
|
| 248 |
]
|
| 249 |
},
|
|
|
|
| 341 |
"cell_type": "markdown",
|
| 342 |
"metadata": {},
|
| 343 |
"source": [
|
| 344 |
+
"Protein embeddings downloaded from [Uniprot](https://www.uniprot.org/help/embeddings).\n",
|
| 345 |
+
"\n",
|
| 346 |
+
"Please note that running the following cell the first time might take a while."
|
| 347 |
+
]
|
| 348 |
+
},
|
| 349 |
+
{
|
| 350 |
+
"cell_type": "code",
|
| 351 |
+
"execution_count": null,
|
| 352 |
+
"metadata": {},
|
| 353 |
+
"outputs": [],
|
| 354 |
+
"source": [
|
| 355 |
+
"import os\n",
|
| 356 |
+
"\n",
|
| 357 |
+
"download_link = \"https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/UP000005640_9606/per-protein.h5\"\n",
|
| 358 |
+
"embeddings_path = \"../data/uniprot2embedding.h5\"\n",
|
| 359 |
+
"if not os.path.exists(embeddings_path):\n",
|
| 360 |
+
" !wget {download_link} {embeddings_path}"
|
| 361 |
]
|
| 362 |
},
|
| 363 |
{
|
|
|
|
| 410 |
"from tqdm.auto import tqdm\n",
|
| 411 |
"\n",
|
| 412 |
"protein_embeddings = {}\n",
|
| 413 |
+
"with h5py.File(\"../data/per-protein-embeddings.h5\", \"r\") as file:\n",
|
| 414 |
" print(f\"number of entries: {len(file.items()):,}\")\n",
|
| 415 |
" uniprots = protac_df['Uniprot'].unique().tolist()\n",
|
| 416 |
" uniprots += protac_df['E3 Ligase Uniprot'].unique().tolist()\n",
|