{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "c59f55d8-e623-4f92-af20-ad2666e990c4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/.local/lib/python3.10/site-packages/matplotlib/projections/__init__.py:63: UserWarning: Unable to import Axes3D. This may be due to multiple versions of Matplotlib being installed (e.g. as a system package and as a pip package). As a result, the 3D projection is not available.\n", " warnings.warn(\"Unable to import Axes3D. This may be due to multiple versions of \"\n" ] } ], "source": [ "import scanpy as sc\n", "import pandas as pd \n", "import anndata as ad\n", "from tqdm import tqdm" ] }, { "cell_type": "markdown", "id": "8fee8391-6ed8-4038-a307-f2a86513f903", "metadata": {}, "source": [ "## Building a combined anndata with embeddings from PCA, scVI, CVI, TF" ] }, { "cell_type": "code", "execution_count": 25, "id": "46d9e62e-660f-46a8-bfc8-dcc809c21d86", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/.local/lib/python3.10/site-packages/anndata/_core/anndata.py:1756: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n", " utils.warn_names_duplicates(\"obs\")\n" ] } ], "source": [ "scvi_edata = sc.read_h5ad('./embeddings/inference_100x_scVI.h5ad')" ] }, { "cell_type": "code", "execution_count": 24, "id": "e2156803-4ee6-4839-bc7a-dd40e170a000", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/.local/lib/python3.10/site-packages/anndata/_core/anndata.py:1756: UserWarning: Observation names are not unique. 
To make them unique, call `.obs_names_make_unique`.\n", " utils.warn_names_duplicates(\"obs\")\n" ] } ], "source": [ "cvi_edata = sc.read_h5ad('./embeddings/test_100x_contrastiveVI.h5ad')" ] }, { "cell_type": "code", "execution_count": null, "id": "747d7b36-d294-4658-9d6e-50dac3f21555", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 28, "id": "1b06309a-5121-41e6-89a4-e3384572b894", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 482620 × 10000\n", " obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate', 'train_val_test_split', '_scvi_batch', '_scvi_labels'\n", " uns: '_scvi_manager_uuid', '_scvi_uuid'\n", " obsm: 'background_rep_contrastiveVI', 'salient_rep_contrastiveVI'\n", " layers: 'raw'" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cvi_edata" ] }, { "cell_type": "code", "execution_count": 29, "id": "fd6b8d89-830e-46ff-9d6a-36b72179d4a6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 482620 × 10000\n", " obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate', 'train_val_test_split', '_scvi_batch', '_scvi_labels'\n", " uns: '_scvi_manager_uuid', '_scvi_uuid', 'neighbors', 'umap'\n", " obsm: 'X_scVI'\n", " layers: 'raw'\n", " obsp: 'connectivities', 'distances'" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scvi_edata" ] }, { "cell_type": "code", "execution_count": null, "id": "a3b5e8a7-d69c-4b32-b636-08b68babf4f4", "metadata": {}, "outputs": [], "source": [ "# Preview the salient contrastiveVI embedding (first rows only).\n", "cvi_edata.obsm['salient_rep_contrastiveVI'][:5]" ] }, { "cell_type": "code", 
"execution_count": null, "id": "33051582-031c-4ee8-9afb-9e97905d518f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "d0fe68d5-b75b-4171-96de-62cf17d9f133", "metadata": {}, "source": [ "## Collapse cVI" ] }, { "cell_type": "code", "execution_count": 33, "id": "49b4a3c0-9dff-4df2-bd6e-1b96054518a3", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_51540/1378527330.py:21: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", " .groupby([\"drug\", \"cell_line\"])[feat_cols]\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import scanpy as sc\n", "from anndata import AnnData\n", "\n", "\n", "feat_cols = [f\"drug_emb_{i}\" for i in range(cvi_edata.obsm['salient_rep_contrastiveVI'].shape[1])]\n", "df_embed = pd.DataFrame(\n", " cvi_edata.obsm['salient_rep_contrastiveVI'],\n", " index=cvi_edata.obs_names,\n", " columns=feat_cols\n", ")\n", "\n", "df_embed = df_embed.join(\n", " cvi_edata.obs[[\"drug\", \"cell_line\"]]\n", ")\n", "\n", "#Aggregate (mean) per (drug, cell_line)\n", "grp = (\n", " df_embed\n", " .groupby([\"drug\", \"cell_line\"])[feat_cols]\n", " .mean()\n", " .reset_index()\n", ")\n", "\n", "# Extract DMSO_TF (control) per cell_line\n", "ctrl = (\n", " grp[grp[\"drug\"] == \"DMSO_TF\"]\n", " .set_index(\"cell_line\")[feat_cols]\n", " .rename(lambda c: f\"ctrl_{c}\", axis=1)\n", ")\n", "\n", "# Prepare the “drug vs. 
DMSO” concatenated DataFram# - drop the DMSO rows\n", "df_pairs = grp[grp[\"drug\"] != \"DMSO_TF\"].copy()\n", "\n", "# merge in the control embeddings\n", "df_pairs = df_pairs.merge(\n", " ctrl,\n", " left_on=\"cell_line\",\n", " right_index=True,\n", " how=\"left\"\n", ")\n" ] }, { "cell_type": "code", "execution_count": 34, "id": "fb2ec2d1-b222-4f63-bc18-27180a7af67b", "metadata": {}, "outputs": [], "source": [ "## Build the new AnnData\n", "# - X is the concatenated vector of [drug_avg, ctrl_avg]\n", "feature_cols = feat_cols + list(ctrl.columns)\n", "X_concat = df_pairs[feature_cols].values\n", "\n", "# - obs is the metadata of each drug-cell pair\n", "obs_meta = df_pairs[[\"drug\", \"cell_line\"]].copy()\n", "obs_meta.index = pd.Index(\n", " [f\"{d}__{cl}\" for d, cl in zip(obs_meta[\"drug\"], obs_meta[\"cell_line\"])],\n", " name=\"drug|cell_line\"\n", ")\n", "# Create var DataFrame with feature names\n", "var = pd.DataFrame(index=pd.Index(feature_cols, name=\"features\"))\n", "cVI_collapsed_adata = AnnData(\n", " X=X_concat,\n", " obs=obs_meta,\n", " var=var\n", ")\n" ] }, { "cell_type": "code", "execution_count": 35, "id": "38cdaa61-ae14-4ed0-9ac3-d018626ec1b9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 18950 × 20\n", " obs: 'drug', 'cell_line'" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cVI_collapsed_adata" ] }, { "cell_type": "markdown", "id": "7c004cad-06a4-4506-bd4b-f175b0822958", "metadata": {}, "source": [ "## Collapse scVI" ] }, { "cell_type": "code", "execution_count": 30, "id": "3768b9a0-a5c5-407f-928d-8bcbec8985af", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_51540/1170690059.py:21: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", " .groupby([\"drug\", \"cell_line\"])[feat_cols]\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import scanpy as sc\n", "from anndata import AnnData\n", "\n", "\n", "feat_cols = [f\"drug_emb_{i}\" for i in range(scvi_edata.obsm['X_scVI'].shape[1])]\n", "df_embed = pd.DataFrame(\n", " scvi_edata.obsm['X_scVI'],\n", " index=scvi_edata.obs_names,\n", " columns=feat_cols\n", ")\n", "\n", "df_embed = df_embed.join(\n", " scvi_edata.obs[[\"drug\", \"cell_line\"]]\n", ")\n", "\n", "#Aggregate (mean) per (drug, cell_line)\n", "grp = (\n", " df_embed\n", " .groupby([\"drug\", \"cell_line\"])[feat_cols]\n", " .mean()\n", " .reset_index()\n", ")\n", "\n", "# Extract DMSO_TF (control) per cell_line\n", "ctrl = (\n", " grp[grp[\"drug\"] == \"DMSO_TF\"]\n", " .set_index(\"cell_line\")[feat_cols]\n", " .rename(lambda c: f\"ctrl_{c}\", axis=1)\n", ")\n", "\n", "# Prepare the “drug vs. 
DMSO” concatenated DataFram# - drop the DMSO rows\n", "df_pairs = grp[grp[\"drug\"] != \"DMSO_TF\"].copy()\n", "\n", "# merge in the control embeddings\n", "df_pairs = df_pairs.merge(\n", " ctrl,\n", " left_on=\"cell_line\",\n", " right_index=True,\n", " how=\"left\"\n", ")\n" ] }, { "cell_type": "code", "execution_count": 31, "id": "3bc8fb79-c00d-47e5-9a71-7600b49cb82a", "metadata": {}, "outputs": [], "source": [ "## Build the new AnnData\n", "# - X is the concatenated vector of [drug_avg, ctrl_avg]\n", "feature_cols = feat_cols + list(ctrl.columns)\n", "X_concat = df_pairs[feature_cols].values\n", "\n", "# - obs is the metadata of each drug-cell pair\n", "obs_meta = df_pairs[[\"drug\", \"cell_line\"]].copy()\n", "obs_meta.index = pd.Index(\n", " [f\"{d}__{cl}\" for d, cl in zip(obs_meta[\"drug\"], obs_meta[\"cell_line\"])],\n", " name=\"drug|cell_line\"\n", ")\n", "# Create var DataFrame with feature names\n", "var = pd.DataFrame(index=pd.Index(feature_cols, name=\"features\"))\n", "scVI_collapsed_adata = AnnData(\n", " X=X_concat,\n", " obs=obs_meta,\n", " var=var\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "788844e3-fc8c-4552-bbe6-4b76101b393a", "metadata": {}, "outputs": [], "source": [ "scVI_collapsed_adata" ] }, { "cell_type": "code", "execution_count": 32, "id": "ad83ffbc-9528-4d0a-a26b-b2dd7309bd3c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 18950 × 20\n", " obs: 'drug', 'cell_line'" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scVI_collapsed_adata" ] }, { "cell_type": "code", "execution_count": 39, "id": "bdfabdf8-8d9f-4635-9a72-8866e85765da", "metadata": {}, "outputs": [], "source": [ "scVI_collapsed_adata.obsm = {\n", " 'scVI': scVI_collapsed_adata.X.copy(),\n", " 'cVI': cVI_collapsed_adata.X.copy()\n", "}" ] }, { "cell_type": "code", "execution_count": 40, "id": "bba5a69b-30e0-4733-abd9-ed2b83cfd69e", "metadata": {}, "outputs": 
[ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 18950 × 20\n", " obs: 'drug', 'cell_line'\n", " obsm: 'scVI', 'cVI'" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scVI_collapsed_adata" ] }, { "cell_type": "code", "execution_count": 42, "id": "df29123a-6b8e-4248-b6a4-1acdfd0715b4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
features
drug_emb_0
drug_emb_1
drug_emb_2
drug_emb_3
drug_emb_4
drug_emb_5
drug_emb_6
drug_emb_7
drug_emb_8
drug_emb_9
ctrl_drug_emb_0
ctrl_drug_emb_1
ctrl_drug_emb_2
ctrl_drug_emb_3
ctrl_drug_emb_4
ctrl_drug_emb_5
ctrl_drug_emb_6
ctrl_drug_emb_7
ctrl_drug_emb_8
ctrl_drug_emb_9
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: []\n", "Index: [drug_emb_0, drug_emb_1, drug_emb_2, drug_emb_3, drug_emb_4, drug_emb_5, drug_emb_6, drug_emb_7, drug_emb_8, drug_emb_9, ctrl_drug_emb_0, ctrl_drug_emb_1, ctrl_drug_emb_2, ctrl_drug_emb_3, ctrl_drug_emb_4, ctrl_drug_emb_5, ctrl_drug_emb_6, ctrl_drug_emb_7, ctrl_drug_emb_8, ctrl_drug_emb_9]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scVI_collapsed_adata.var" ] }, { "cell_type": "code", "execution_count": 41, "id": "127effb2-c032-448c-9699-06245b0ced54", "metadata": {}, "outputs": [], "source": [ "import scanpy as sc\n", "\n", "# Assume you have an AnnData object called adata\n", "sc.write('tahoe_100x_scvI_vi_aggregated.h5ad', scVI_collapsed_adata, compression='gzip')" ] }, { "cell_type": "code", "execution_count": null, "id": "597d5f86-0b64-4b30-be0a-e82b140ae243", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "06b965ad-7ac3-4131-bf16-2c83658fe5bd", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "75e35b62-6dd2-4da7-9654-3b27ec71c3ae", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "9c9b69b4-634a-4144-9748-500f8873de43", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "50262e7a-9603-469c-8133-e301b82f9b0d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "e697a308-3714-4444-89d6-6d154c4110e1", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "d702eccc-d2ee-4ff1-a9c4-2553f1e90150", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "bb7fafed-3b16-4f69-8e4f-9f624679d82e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": 
"f5aa4408-d01c-40df-b771-7c927ad4ebe9", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "0f492174-4bcf-43d8-b5a1-9febe41dba56", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 14, "id": "54ff654f-73ad-4ec5-a583-4975b89359fa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test_plate14_100x.h5ad\n", "test_shared_MOA_100x.h5ad\n", "test_shared_organ_100x.h5ad\n", "test_unseen_MOA_100x.h5ad\n", "test_unseen_organ_100x.h5ad\n", "train_100x.h5ad\n", "val_plate14_100x.h5ad\n", "val_shared_MOA_100x.h5ad\n", "val_shared_organ_100x.h5ad\n", "val_unseen_MOA_100x.h5ad\n", "val_unseen_organ_100x.h5ad\n" ] } ], "source": [ "import os\n", "ad_1000x =[]\n", "for file in os.listdir('./subsets/hvgs/100x'):\n", "\n", " print(file)\n", " adata = sc.read_h5ad(f\"./subsets/hvgs/100x/{file}\")\n", " adata.obs['train_val_test_split'] = file.split(\"_\")[0]\n", " ad_1000x.append(adata)" ] }, { "cell_type": "code", "execution_count": 16, "id": "0534fada-9584-4d05-af1c-21a400bb879e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/.local/lib/python3.10/site-packages/anndata/_core/anndata.py:1756: UserWarning: Observation names are not unique. 
To make them unique, call `.obs_names_make_unique`.\n", " utils.warn_names_duplicates(\"obs\")\n" ] } ], "source": [ "inference_data = ad.concat(ad_1000x)\n", "# Concatenation must keep every cell of every subset file.\n", "assert inference_data.n_obs == sum(a.n_obs for a in ad_1000x)" ] }, { "cell_type": "code", "execution_count": 17, "id": "2a255237-ba02-498e-a4e2-5b03e1115f47", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(482620, 10000)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inference_data.shape" ] }, { "cell_type": "code", "execution_count": 19, "id": "3fe6c670-79b5-4fbf-973c-19725d59889f", "metadata": {}, "outputs": [], "source": [ "# Persist the concatenated 100x subsets; obs['train_val_test_split'] records\n", "# which split each cell was read from.\n", "sc.write('tahoe_100x_inference.h5ad', inference_data, compression='gzip')" ] }, { "cell_type": "code", "execution_count": 190, "id": "0c4bcff3-e844-41f4-b1dd-5a74864fa265", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
samplegene_counttscp_countmread_countdrugname_drugconcdrugcell_linesublibraryBARCODEpcnt_mitoS_scoreG2M_scorephasepass_filtercell_nameplate
BARCODE_SUB_LIB_ID
95_127_189-lib_877smp_1589608747889[('DMSO_TF', 0.0, 'uM')]DMSO_TFCVCL_0504lib_87795_127_1890.0789830.009228-0.061905SfullRKOplate1
95_165_114-lib_850smp_1589196934864262[('DMSO_TF', 0.0, 'uM')]DMSO_TFCVCL_1097lib_85095_165_1140.042742-0.242857-0.258097G1fullC32plate1
96_094_163-lib_898smp_15907039651186[('DMSO_TF', 0.0, 'uM')]DMSO_TFCVCL_1717lib_89896_094_1630.078756-0.0336070.000000G2MfullSW1417plate1
96_146_131-lib_889smp_1590151122142749[('DMSO_TF', 0.0, 'uM')]DMSO_TFCVCL_0428lib_88996_146_1310.050587-0.091137-0.100962G1fullMIA PaCa-2plate1
96_107_116-lib_898smp_15905818251002[('DMSO_TF', 0.0, 'uM')]DMSO_TFCVCL_0334lib_89896_107_1160.080000-0.0239230.033654G2MfullHs 766Tplate1
...................................................
48_059_091-lib_2083smp_2790118217322060[('Ritonavir', 5.0, 'uM')]RitonavirCVCL_0218lib_208348_059_0910.0612010.531746-0.034631SfullCOLO 205plate14
54_034_175-lib_2112smp_279692612131425[('Lidocaine (hydrochloride)', 5.0, 'uM')]Lidocaine (hydrochloride)CVCL_1635lib_211254_034_1750.046167-0.085714-0.066484G1fullPanc 03.27plate14
45_045_054-lib_2080smp_278772710371247[('Vilanterol', 5.0, 'uM')]VilanterolCVCL_0320lib_208045_045_0540.101254-0.087649-0.066986G1fullHT-29plate14
01_064_159-lib_2069smp_2743166825012966[('8-Hydroxyquinoline', 5.0, 'uM')]8-HydroxyquinolineCVCL_1547lib_206901_064_1590.0363850.4095240.149267SfullNCI-H23plate14
21_190_144-lib_2098smp_2763198233463881[('Megestrol', 5.0, 'uM')]MegestrolCVCL_1495lib_209821_190_1440.045129-0.181932-0.127839G1fullNCI-H1792plate14
\n", "

4780 rows × 16 columns

\n", "
" ], "text/plain": [ " sample gene_count tscp_count mread_count \\\n", "BARCODE_SUB_LIB_ID \n", "95_127_189-lib_877 smp_1589 608 747 889 \n", "95_165_114-lib_850 smp_1589 1969 3486 4262 \n", "96_094_163-lib_898 smp_1590 703 965 1186 \n", "96_146_131-lib_889 smp_1590 1511 2214 2749 \n", "96_107_116-lib_898 smp_1590 581 825 1002 \n", "... ... ... ... ... \n", "48_059_091-lib_2083 smp_2790 1182 1732 2060 \n", "54_034_175-lib_2112 smp_2796 926 1213 1425 \n", "45_045_054-lib_2080 smp_2787 727 1037 1247 \n", "01_064_159-lib_2069 smp_2743 1668 2501 2966 \n", "21_190_144-lib_2098 smp_2763 1982 3346 3881 \n", "\n", " drugname_drugconc \\\n", "BARCODE_SUB_LIB_ID \n", "95_127_189-lib_877 [('DMSO_TF', 0.0, 'uM')] \n", "95_165_114-lib_850 [('DMSO_TF', 0.0, 'uM')] \n", "96_094_163-lib_898 [('DMSO_TF', 0.0, 'uM')] \n", "96_146_131-lib_889 [('DMSO_TF', 0.0, 'uM')] \n", "96_107_116-lib_898 [('DMSO_TF', 0.0, 'uM')] \n", "... ... \n", "48_059_091-lib_2083 [('Ritonavir', 5.0, 'uM')] \n", "54_034_175-lib_2112 [('Lidocaine (hydrochloride)', 5.0, 'uM')] \n", "45_045_054-lib_2080 [('Vilanterol', 5.0, 'uM')] \n", "01_064_159-lib_2069 [('8-Hydroxyquinoline', 5.0, 'uM')] \n", "21_190_144-lib_2098 [('Megestrol', 5.0, 'uM')] \n", "\n", " drug cell_line sublibrary \\\n", "BARCODE_SUB_LIB_ID \n", "95_127_189-lib_877 DMSO_TF CVCL_0504 lib_877 \n", "95_165_114-lib_850 DMSO_TF CVCL_1097 lib_850 \n", "96_094_163-lib_898 DMSO_TF CVCL_1717 lib_898 \n", "96_146_131-lib_889 DMSO_TF CVCL_0428 lib_889 \n", "96_107_116-lib_898 DMSO_TF CVCL_0334 lib_898 \n", "... ... ... ... 
\n", "48_059_091-lib_2083 Ritonavir CVCL_0218 lib_2083 \n", "54_034_175-lib_2112 Lidocaine (hydrochloride) CVCL_1635 lib_2112 \n", "45_045_054-lib_2080 Vilanterol CVCL_0320 lib_2080 \n", "01_064_159-lib_2069 8-Hydroxyquinoline CVCL_1547 lib_2069 \n", "21_190_144-lib_2098 Megestrol CVCL_1495 lib_2098 \n", "\n", " BARCODE pcnt_mito S_score G2M_score phase \\\n", "BARCODE_SUB_LIB_ID \n", "95_127_189-lib_877 95_127_189 0.078983 0.009228 -0.061905 S \n", "95_165_114-lib_850 95_165_114 0.042742 -0.242857 -0.258097 G1 \n", "96_094_163-lib_898 96_094_163 0.078756 -0.033607 0.000000 G2M \n", "96_146_131-lib_889 96_146_131 0.050587 -0.091137 -0.100962 G1 \n", "96_107_116-lib_898 96_107_116 0.080000 -0.023923 0.033654 G2M \n", "... ... ... ... ... ... \n", "48_059_091-lib_2083 48_059_091 0.061201 0.531746 -0.034631 S \n", "54_034_175-lib_2112 54_034_175 0.046167 -0.085714 -0.066484 G1 \n", "45_045_054-lib_2080 45_045_054 0.101254 -0.087649 -0.066986 G1 \n", "01_064_159-lib_2069 01_064_159 0.036385 0.409524 0.149267 S \n", "21_190_144-lib_2098 21_190_144 0.045129 -0.181932 -0.127839 G1 \n", "\n", " pass_filter cell_name plate \n", "BARCODE_SUB_LIB_ID \n", "95_127_189-lib_877 full RKO plate1 \n", "95_165_114-lib_850 full C32 plate1 \n", "96_094_163-lib_898 full SW1417 plate1 \n", "96_146_131-lib_889 full MIA PaCa-2 plate1 \n", "96_107_116-lib_898 full Hs 766T plate1 \n", "... ... ... ... \n", "48_059_091-lib_2083 full COLO 205 plate14 \n", "54_034_175-lib_2112 full Panc 03.27 plate14 \n", "45_045_054-lib_2080 full HT-29 plate14 \n", "01_064_159-lib_2069 full NCI-H23 plate14 \n", "21_190_144-lib_2098 full NCI-H1792 plate14 \n", "\n", "[4780 rows x 16 columns]" ] }, "execution_count": 190, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inference_data.obs" ] }, { "cell_type": "code", "execution_count": 23, "id": "d36c1381-15f8-4c95-b931-ddbddb7f93bb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugcell_linetrain_val_test_split
BARCODE_SUB_LIB_ID
38_132_056-lib_2077ResveratrolCVCL_1693test
56_171_034-lib_2099Mitoxantrone (dihydrochloride)CVCL_1055test
10_086_185-lib_2089Palmatine (chloride)CVCL_1381test
41_020_157-lib_2143Tofacitinib (citrate)CVCL_0359test
18_057_171-lib_2076PyridoxineCVCL_1097test
............
49_098_178-lib_2158Pentamidine (isethionate)CVCL_0359val
58_111_017-lib_2064BusulfanCVCL_0359val
03_046_142-lib_2091TrifluridineCVCL_0359val
55_191_132-lib_2102DemeclocyclineCVCL_0359val
34_186_018-lib_2073Fluvoxamine (maleate)CVCL_0359val
\n", "

482620 rows × 3 columns

\n", "
" ], "text/plain": [ " drug cell_line \\\n", "BARCODE_SUB_LIB_ID \n", "38_132_056-lib_2077 Resveratrol CVCL_1693 \n", "56_171_034-lib_2099 Mitoxantrone (dihydrochloride) CVCL_1055 \n", "10_086_185-lib_2089 Palmatine (chloride) CVCL_1381 \n", "41_020_157-lib_2143 Tofacitinib (citrate) CVCL_0359 \n", "18_057_171-lib_2076 Pyridoxine CVCL_1097 \n", "... ... ... \n", "49_098_178-lib_2158 Pentamidine (isethionate) CVCL_0359 \n", "58_111_017-lib_2064 Busulfan CVCL_0359 \n", "03_046_142-lib_2091 Trifluridine CVCL_0359 \n", "55_191_132-lib_2102 Demeclocycline CVCL_0359 \n", "34_186_018-lib_2073 Fluvoxamine (maleate) CVCL_0359 \n", "\n", " train_val_test_split \n", "BARCODE_SUB_LIB_ID \n", "38_132_056-lib_2077 test \n", "56_171_034-lib_2099 test \n", "10_086_185-lib_2089 test \n", "41_020_157-lib_2143 test \n", "18_057_171-lib_2076 test \n", "... ... \n", "49_098_178-lib_2158 val \n", "58_111_017-lib_2064 val \n", "03_046_142-lib_2091 val \n", "55_191_132-lib_2102 val \n", "34_186_018-lib_2073 val \n", "\n", "[482620 rows x 3 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inference_data.obs[['drug','cell_line','train_val_test_split']] #DMSO_TF" ] }, { "cell_type": "code", "execution_count": null, "id": "fee1e4a9-292b-4bcc-94b5-78e2c215c454", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "db8ef9e2-9eb6-42b2-b257-34e6c4f4adde", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ecf349c8-dc2f-415a-a491-cbd7b1c1298f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 159, "id": "1ec108c4-6874-4c0c-a800-f8c2233a0e8c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "download: s3://tahoe-hackathon-data/tahoe_sampled_1M.h5ad.gz to ./tahoe_sampled_1M.h5ad.gz\n" ] } ], "source": [ "# !aws s3 ls s3://tahoe-hackathon-data/ --no-sign-request\n", "!aws 
s3 cp s3://tahoe-hackathon-data/tahoe_sampled_1M.h5ad.gz ./tahoe_sampled_1M.h5ad.gz --no-sign-request\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b72a4de8-94e7-4132-aef9-48f4708746e8", "metadata": {}, "outputs": [], "source": [ "# Source object: s3://tahoe-hackathon-data/tahoe_sampled_1M.h5ad.gz" ] }, { "cell_type": "code", "execution_count": 10, "id": "1fe008bc-92c5-4444-91f4-fc98d52f1ced", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "README.md \u001b[0m\u001b[01;34mh5ad\u001b[0m/ pseudobulk_subsample.ipynb uv.lock\n", "TFormer_test.ipynb \u001b[01;34mhackathon\u001b[0m/ pyproject.toml\n", "concentrations.csv main.py \u001b[01;34msubsets\u001b[0m/\n" ] } ], "source": [ "ls" ] }, { "cell_type": "code", "execution_count": 161, "id": "a650c179-fa4c-438d-9216-bfd97f13e6af", "metadata": {}, "outputs": [], "source": [ "tahoe_1M = sc.read_h5ad('./tahoe_sampled_1M.h5ad.gz')" ] }, { "cell_type": "code", "execution_count": 171, "id": "3203ba8f-3f30-43eb-976f-2cf1af6bdaa0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 1000000 × 62710\n", " obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'id', 'drugname_drugconc', 'drug', 'INT_ID', 'NUM.SNPS', 'NUM.READS', 'demuxlet_call', 'BEST.GUESS', 'BEST.LLK', 'NEXT.GUESS', 'NEXT.LLK', 'DIFF.LLK.BEST.NEXT', 'BEST.POSTERIOR', 'SNG.POSTERIOR', 'cell_line', 'SNG.BEST.LLK', 'SNG.NEXT.GUESS', 'SNG.NEXT.LLK', 'SNG.ONLY.POSTERIOR', 'DBL.BEST.GUESS', 'DBL.BEST.LLK', 'DIFF.LLK.SNG.DBL', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'cell_line_orig', 'pass_filter', 'cell_name'" ] }, "execution_count": 171, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tahoe_1M" ] }, { "cell_type": "code", "execution_count": 164, "id": "80d0634e-8ba4-4cc8-a1f6-844e504e70b1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"Index(['sample', 'species', 'gene_count', 'tscp_count', 'mread_count',\n", " 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well',\n", " 'id', 'drugname_drugconc', 'drug', 'INT_ID', 'NUM.SNPS', 'NUM.READS',\n", " 'demuxlet_call', 'BEST.GUESS', 'BEST.LLK', 'NEXT.GUESS', 'NEXT.LLK',\n", " 'DIFF.LLK.BEST.NEXT', 'BEST.POSTERIOR', 'SNG.POSTERIOR', 'cell_line',\n", " 'SNG.BEST.LLK', 'SNG.NEXT.GUESS', 'SNG.NEXT.LLK', 'SNG.ONLY.POSTERIOR',\n", " 'DBL.BEST.GUESS', 'DBL.BEST.LLK', 'DIFF.LLK.SNG.DBL', 'sublibrary',\n", " 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase',\n", " 'cell_line_orig', 'pass_filter', 'cell_name'],\n", " dtype='object')" ] }, "execution_count": 164, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tahoe_1M.obs.columns" ] }, { "cell_type": "code", "execution_count": 170, "id": "9a049a46-454c-45a4-8ea9-13b707f7fba9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "93" ] }, "execution_count": 170, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tahoe_1M.obs['drug'].nunique()" ] }, { "cell_type": "code", "execution_count": 163, "id": "e4a22f48-e635-42d9-b29c-fbb4a6bb64ed", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
samplespeciesgene_counttscp_countmread_countbc1_windbc2_windbc3_windbc1_wellbc2_well...DIFF.LLK.SNG.DBLsublibraryBARCODEpcnt_mitoS_scoreG2M_scorephasecell_line_origpass_filtercell_name
BARCODE_SUB_LIB_ID
75_040_049-lib_2604smp_1761hg38109915301791754049G3D4...-1.70lib_260475_040_0490.0549020.3052550.048399SCVCL_0359fullJ82
75_108_162-lib_1051smp_1761hg3810161302156575108162G3p2.A12...1.83lib_105175_108_1620.033026-0.0479980.404762G2MCVCL_0023fullA549
32_130_058-lib_1047smp_1718hg381677251129543213058C8p2.C10...10.49lib_104732_130_0580.0927920.1507940.391392G2MCVCL_0480fullPANC-1
88_105_132-lib_1033smp_1774hg3861678892788105132H4p2.A9...0.85lib_103388_105_1320.022843-0.056000-0.033150G1CVCL_1285fullHOP62
45_003_187-lib_1099smp_1731hg38102213971679453187D9A3...1.95lib_109945_003_1870.0787400.059524-0.008974SCVCL_0131fullA-172
\n", "

5 rows × 42 columns

\n", "
" ], "text/plain": [ " sample species gene_count tscp_count mread_count \\\n", "BARCODE_SUB_LIB_ID \n", "75_040_049-lib_2604 smp_1761 hg38 1099 1530 1791 \n", "75_108_162-lib_1051 smp_1761 hg38 1016 1302 1565 \n", "32_130_058-lib_1047 smp_1718 hg38 1677 2511 2954 \n", "88_105_132-lib_1033 smp_1774 hg38 616 788 927 \n", "45_003_187-lib_1099 smp_1731 hg38 1022 1397 1679 \n", "\n", " bc1_wind bc2_wind bc3_wind bc1_well bc2_well ... \\\n", "BARCODE_SUB_LIB_ID ... \n", "75_040_049-lib_2604 75 40 49 G3 D4 ... \n", "75_108_162-lib_1051 75 108 162 G3 p2.A12 ... \n", "32_130_058-lib_1047 32 130 58 C8 p2.C10 ... \n", "88_105_132-lib_1033 88 105 132 H4 p2.A9 ... \n", "45_003_187-lib_1099 45 3 187 D9 A3 ... \n", "\n", " DIFF.LLK.SNG.DBL sublibrary BARCODE pcnt_mito \\\n", "BARCODE_SUB_LIB_ID \n", "75_040_049-lib_2604 -1.70 lib_2604 75_040_049 0.054902 \n", "75_108_162-lib_1051 1.83 lib_1051 75_108_162 0.033026 \n", "32_130_058-lib_1047 10.49 lib_1047 32_130_058 0.092792 \n", "88_105_132-lib_1033 0.85 lib_1033 88_105_132 0.022843 \n", "45_003_187-lib_1099 1.95 lib_1099 45_003_187 0.078740 \n", "\n", " S_score G2M_score phase cell_line_orig pass_filter \\\n", "BARCODE_SUB_LIB_ID \n", "75_040_049-lib_2604 0.305255 0.048399 S CVCL_0359 full \n", "75_108_162-lib_1051 -0.047998 0.404762 G2M CVCL_0023 full \n", "32_130_058-lib_1047 0.150794 0.391392 G2M CVCL_0480 full \n", "88_105_132-lib_1033 -0.056000 -0.033150 G1 CVCL_1285 full \n", "45_003_187-lib_1099 0.059524 -0.008974 S CVCL_0131 full \n", "\n", " cell_name \n", "BARCODE_SUB_LIB_ID \n", "75_040_049-lib_2604 J82 \n", "75_108_162-lib_1051 A549 \n", "32_130_058-lib_1047 PANC-1 \n", "88_105_132-lib_1033 HOP62 \n", "45_003_187-lib_1099 A-172 \n", "\n", "[5 rows x 42 columns]" ] }, "execution_count": 163, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tahoe_1M.obs.head()" ] }, { "cell_type": "code", "execution_count": 175, "id": "234b073c-e3e3-4b5e-a979-3c0da3a38c9a", "metadata": {}, "outputs": [], "source": [ 
"tahoe_1M.obs['drug_cell_line'] = tahoe_1M.obs['drug'].astype(str) + \"_\" + tahoe_1M.obs['cell_line_orig'].astype(str)" ] }, { "cell_type": "code", "execution_count": null, "id": "8db53660-c70c-4807-962c-349d0e75fa37", "metadata": {}, "outputs": [], "source": [ "# PCA on full data\n", "sc.tl.pca(adata, svd_solver='arpack')\n", "total_variance = np.sum(adata.uns['pca']['variance'])\n", "\n", "# PCA on HVG subset\n", "adata_hvg = adata[:, adata.var['highly_variable']].copy()\n", "sc.tl.pca(adata_hvg, svd_solver='arpack')\n", "hvg_variance = np.sum(adata_hvg.uns['pca']['variance'])\n", "\n", "print(f\"Total variance (full): {total_variance:.2f}\")\n", "print(f\"Variance with HVGs only: {hvg_variance:.2f} ({(hvg_variance / total_variance * 100):.1f}%)\")\n" ] }, { "cell_type": "code", "execution_count": 178, "id": "ced1afce-3dde-4eb7-a4ed-a6bef18a965b", "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "b'There are other near singularities as well. 0.090619'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_14237/2218987055.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtahoe_1M_batch_filtered\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtahoe_1M\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtahoe_1M\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'drug_cell_line'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalid_batches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m 
\u001b[0msc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhighly_variable_genes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtahoe_1M_batch_filtered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflavor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"seurat_v3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_top_genes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mn_top_genes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_key\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'drug_cell_line'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mhvgs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtahoe_1M_batch_filtered\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvar\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtahoe_1M_batch_filtered\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvar\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'highly_variable'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"HVGs found: {len(hvgs)}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.10/site-packages/legacy_api_wrap/__init__.py\u001b[0m in \u001b[0;36mfn_compatible\u001b[0;34m(*args_all, **kw)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfn_compatible\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs_all\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mR\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs_all\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mn_positional\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 82\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs_all\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0margs_pos\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py\u001b[0m in \u001b[0;36mhighly_variable_genes\u001b[0;34m(adata, layer, n_top_genes, min_disp, max_disp, min_mean, max_mean, span, n_bins, flavor, subset, inplace, batch_key, check_values)\u001b[0m\n\u001b[1;32m 658\u001b[0m \u001b[0msig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msignature\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_highly_variable_genes_seurat_v3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 659\u001b[0m \u001b[0mn_top_genes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"int\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"n_top_genes\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefault\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 660\u001b[0;31m return _highly_variable_genes_seurat_v3(\n\u001b[0m\u001b[1;32m 661\u001b[0m \u001b[0madata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 662\u001b[0m 
\u001b[0mflavor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mflavor\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py\u001b[0m in \u001b[0;36m_highly_variable_genes_seurat_v3\u001b[0;34m(adata, flavor, layer, n_top_genes, batch_key, check_values, span, subset, inplace)\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog10\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnot_const\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspan\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspan\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdegree\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0mestimat_var\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnot_const\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfitted_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0mreg_std\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msqrt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mestimat_var\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m_loess.pyx\u001b[0m in \u001b[0;36m_loess.loess.fit\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: b'There are other near singularities as well. 0.090619'" ] } ], "source": [ "batch_counts = tahoe_1M.obs['drug_cell_line'].value_counts()\n", "valid_batches = batch_counts[batch_counts >= 20].index\n", "tahoe_1M_batch_filtered = tahoe_1M[tahoe_1M.obs['drug_cell_line'].isin(valid_batches)].copy()\n", "\n", "sc.pp.highly_variable_genes(tahoe_1M_batch_filtered, flavor=\"seurat_v3\", n_top_genes=n_top_genes, batch_key='drug_cell_line')\n", "hvgs = tahoe_1M_batch_filtered.var[tahoe_1M_batch_filtered.var['highly_variable']].index\n", "print(f\"HVGs found: {len(hvgs)}\")" ] }, { "cell_type": "code", "execution_count": 177, "id": "56b75f39-f84b-4670-ab04-3071f8e1e280", "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "b'There are other near singularities as well. 
0.090619'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_14237/1625787329.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mn_top_genes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m500\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhighly_variable_genes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtahoe_1M\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflavor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"seurat_v3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_top_genes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mn_top_genes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_key\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'drug_cell_line'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/.local/lib/python3.10/site-packages/legacy_api_wrap/__init__.py\u001b[0m in \u001b[0;36mfn_compatible\u001b[0;34m(*args_all, **kw)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfn_compatible\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs_all\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mR\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs_all\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<=\u001b[0m 
\u001b[0mn_positional\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 82\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs_all\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0margs_pos\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py\u001b[0m in \u001b[0;36mhighly_variable_genes\u001b[0;34m(adata, layer, n_top_genes, min_disp, max_disp, min_mean, max_mean, span, n_bins, flavor, subset, inplace, batch_key, check_values)\u001b[0m\n\u001b[1;32m 658\u001b[0m \u001b[0msig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msignature\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_highly_variable_genes_seurat_v3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 659\u001b[0m \u001b[0mn_top_genes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"int\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"n_top_genes\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefault\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 660\u001b[0;31m return _highly_variable_genes_seurat_v3(\n\u001b[0m\u001b[1;32m 661\u001b[0m \u001b[0madata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 662\u001b[0m 
\u001b[0mflavor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mflavor\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.10/site-packages/scanpy/preprocessing/_highly_variable_genes.py\u001b[0m in \u001b[0;36m_highly_variable_genes_seurat_v3\u001b[0;34m(adata, flavor, layer, n_top_genes, batch_key, check_values, span, subset, inplace)\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog10\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnot_const\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspan\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspan\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdegree\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0mestimat_var\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnot_const\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfitted_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0mreg_std\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msqrt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mestimat_var\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m_loess.pyx\u001b[0m in \u001b[0;36m_loess.loess.fit\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: b'There are other near singularities as well. 0.090619'" ] } ], "source": [ "n_top_genes = 500\n", "sc.pp.highly_variable_genes(tahoe_1M, flavor=\"seurat_v3\", n_top_genes=n_top_genes, batch_key='drug_cell_line')" ] }, { "cell_type": "code", "execution_count": null, "id": "4e54b14e-c964-4c69-92c8-5e93d3ff1d2e", "metadata": {}, "outputs": [], "source": [
 "import scanpy as sc\n",
 "import pandas as pd\n",
 "import numpy as np\n",
 "from joblib import Parallel, delayed\n",
 "\n",
 "# Step 1: Define unique drug-cell line combinations\n",
 "combinations = adata.obs[[\"compound\", \"cell_line\"]].drop_duplicates().values.tolist()\n",
 "\n",
 "# Step 2: Function to run HVG per combination\n",
 "def get_hvgs_for_combination(compound, cell_line, n_top_genes=2000, min_cells=10):\n",
 "    \"\"\"Return the set of highly variable gene names for one (compound, cell_line) group.\n",
 "\n",
 "    Reads the notebook-level `adata` and keeps the rows whose `compound` and\n",
 "    `cell_line` columns equal the given values.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    compound, cell_line : values compared against adata.obs['compound'] / adata.obs['cell_line'].\n",
 "    n_top_genes : number of genes requested from sc.pp.highly_variable_genes.\n",
 "    min_cells : groups with fewer cells than this are skipped.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    set of gene names; empty when the group is too small or scanpy raises.\n",
 "    \"\"\"\n",
 "    in_group = (adata.obs['compound'] == compound) & (adata.obs['cell_line'] == cell_line)\n",
 "    subset = adata[in_group].copy()\n",
 "\n",
 "    # Check if enough cells to compute HVG\n",
 "    if subset.n_obs < min_cells:\n",
 "        return set()  # skip groups with too few cells\n",
 "\n",
 "    try:\n",
 "        # No batch_key here: the subset is already a single compound/cell-line group, so there\n",
 "        # is nothing to batch over, and this cell never creates a 'drug_cell_line' column on\n",
 "        # `adata` (a missing column would be swallowed by the except below for every group).\n",
 "        sc.pp.highly_variable_genes(subset, flavor=\"seurat_v3\", n_top_genes=n_top_genes)\n",
 "        return set(subset.var_names[subset.var['highly_variable']])\n",
 "    except Exception as e:\n",
 "        # Best effort: report the failing group and keep going with the remaining ones.\n",
 "        print(f\"Failed for {compound}-{cell_line}: {e}\")\n",
 "        return set()\n",
 "\n",
 "# Step 3: Run in parallel\n",
 "results = Parallel(n_jobs=-1)(\n",
 "    delayed(get_hvgs_for_combination)(compound, cell_line)\n",
 "    for compound, cell_line in combinations\n",
 ")\n",
 "\n",
 "# Step 4: Union of all HVGs\n",
 "# set().union(*results) also works when `results` is empty; set.union(*[]) raises TypeError.\n",
 "union_hvgs = set().union(*results)\n",
 "\n",
 "# Step 5: Subset original AnnData\n",
 "# A boolean mask keeps the genes in their original adata.var order; indexing with\n",
 "# list(union_hvgs) would use set iteration order, which is not stable between kernel runs.\n",
 "adata_hvg_subset = adata[:, adata.var_names.isin(union_hvgs)].copy()\n"
 ] }, { "cell_type": "code", "execution_count": 162, "id": "457b03e0-982f-487a-8eed-ba0cd68a5a2d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 1000000 × 62710\n", " obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 
'bc2_well', 'bc3_well', 'id', 'drugname_drugconc', 'drug', 'INT_ID', 'NUM.SNPS', 'NUM.READS', 'demuxlet_call', 'BEST.GUESS', 'BEST.LLK', 'NEXT.GUESS', 'NEXT.LLK', 'DIFF.LLK.BEST.NEXT', 'BEST.POSTERIOR', 'SNG.POSTERIOR', 'cell_line', 'SNG.BEST.LLK', 'SNG.NEXT.GUESS', 'SNG.NEXT.LLK', 'SNG.ONLY.POSTERIOR', 'DBL.BEST.GUESS', 'DBL.BEST.LLK', 'DIFF.LLK.SNG.DBL', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'cell_line_orig', 'pass_filter', 'cell_name'" ] }, "execution_count": 162, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tahoe_1M.obs.value_counts(['drug',''" ] }, { "cell_type": "code", "execution_count": 12, "id": "6ac91fc6-da2f-4194-adcd-6a9dd602b658", "metadata": {}, "outputs": [], "source": [ "pseudobulk_adata = sc.read_h5ad('./hackathon/20250213.Tahoe.merged.pseudobulk.public.h5ad')" ] }, { "cell_type": "code", "execution_count": 13, "id": "dbee674f-3359-4ace-860b-b359b9e1933f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 67018 × 62710\n", " obs: 'sample', 'drugname_drugconc', 'drug', 'n_cells', 'tscp_count', 'plate', 'Cell_Name_Vevo', 'Cell_ID_Cellosaur'\n", " var: 'gene_id', 'genome'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pseudobulk_adata" ] }, { "cell_type": "code", "execution_count": 40, "id": "f5111f01-ddf0-49b1-943d-253579831afd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "plate\n", "4 4800\n", "3 4800\n", "5 4800\n", "6 4800\n", "8 4800\n", "7 4800\n", "11 4800\n", "12 4800\n", "9 4800\n", "10 4800\n", "13 4800\n", "14 4796\n", "2 4779\n", "1 4643\n", "Name: count, dtype: int64" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pseudobulk_adata.obs['plate'].value_counts()" ] }, { "cell_type": "code", "execution_count": 41, "id": "4a9b63d6-3029-4825-8b55-144fb24b7d28", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "View of AnnData object 
with n_obs × n_vars = 4796 × 62710\n", " obs: 'sample', 'drugname_drugconc', 'drug', 'n_cells', 'tscp_count', 'plate', 'Cell_Name_Vevo', 'Cell_ID_Cellosaur'\n", " var: 'gene_id', 'genome'" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pseudobulk_adata[pseudobulk_adata.obs['plate']=='14']" ] }, { "cell_type": "code", "execution_count": 15, "id": "1133deee-23fb-428d-a58b-03f80fc37c2d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sampledrugname_drugconcdrugn_cellstscp_countplateCell_Name_VevoCell_ID_Cellosaur
0smp_1975[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline30766686156.06A549CVCL_0023
1smp_1975[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline15054744833.06HS-578TCVCL_0332
2smp_1975[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline17002494504.06HCT15CVCL_0292
3smp_1975[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline35608923357.06HOP62CVCL_1285
4smp_1975[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline18764676765.06SK-MEL-2CVCL_0069
...........................
4795-11smp_2742[('Ribociclib', 0.05, 'uM')]Ribociclib14122102723.013hTERT-HPNECVCL_C466
4796-10smp_2742[('Ribociclib', 0.05, 'uM')]Ribociclib16323507957.013NCI-H23CVCL_1547
4797-10smp_2742[('Ribociclib', 0.05, 'uM')]Ribociclib22255246475.013NCI-H1792CVCL_1495
4798-10smp_2742[('Ribociclib', 0.05, 'uM')]Ribociclib3131648.013NCI-H2122CVCL_1531
4799-10smp_2742[('Ribociclib', 0.05, 'uM')]Ribociclib13193911514.013NCI-H2030CVCL_1517
\n", "

67018 rows × 8 columns

\n", "
" ], "text/plain": [ " sample drugname_drugconc drug \\\n", "0 smp_1975 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "1 smp_1975 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "2 smp_1975 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "3 smp_1975 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "4 smp_1975 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "... ... ... ... \n", "4795-11 smp_2742 [('Ribociclib', 0.05, 'uM')] Ribociclib \n", "4796-10 smp_2742 [('Ribociclib', 0.05, 'uM')] Ribociclib \n", "4797-10 smp_2742 [('Ribociclib', 0.05, 'uM')] Ribociclib \n", "4798-10 smp_2742 [('Ribociclib', 0.05, 'uM')] Ribociclib \n", "4799-10 smp_2742 [('Ribociclib', 0.05, 'uM')] Ribociclib \n", "\n", " n_cells tscp_count plate Cell_Name_Vevo Cell_ID_Cellosaur \n", "0 3076 6686156.0 6 A549 CVCL_0023 \n", "1 1505 4744833.0 6 HS-578T CVCL_0332 \n", "2 1700 2494504.0 6 HCT15 CVCL_0292 \n", "3 3560 8923357.0 6 HOP62 CVCL_1285 \n", "4 1876 4676765.0 6 SK-MEL-2 CVCL_0069 \n", "... ... ... ... ... ... 
\n", "4795-11 1412 2102723.0 13 hTERT-HPNE CVCL_C466 \n", "4796-10 1632 3507957.0 13 NCI-H23 CVCL_1547 \n", "4797-10 2225 5246475.0 13 NCI-H1792 CVCL_1495 \n", "4798-10 31 31648.0 13 NCI-H2122 CVCL_1531 \n", "4799-10 1319 3911514.0 13 NCI-H2030 CVCL_1517 \n", "\n", "[67018 rows x 8 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pseudobulk_adata.obs" ] }, { "cell_type": "code", "execution_count": 20, "id": "49141d2c-4820-4198-ba78-dc96e9903891", "metadata": {}, "outputs": [], "source": [ "concentrations = pd.read_csv(\"concentrations.csv\")\n", "concentrations['drugname_drugconc'].tolist()" ] }, { "cell_type": "code", "execution_count": 27, "id": "37d086b3-bc3f-4cfd-9fc4-e7d5195fcf21", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "380" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(concentrations)" ] }, { "cell_type": "code", "execution_count": 32, "id": "47254800-4c8f-418a-93a9-f359d5180d68", "metadata": {}, "outputs": [], "source": [ "single_dose_pb_adata = pseudobulk_adata[pseudobulk_adata.obs['drugname_drugconc'].isin(concentrations['drugname_drugconc'].tolist())].copy()" ] }, { "cell_type": "code", "execution_count": 33, "id": "f1604a7c-656e-4185-94ad-11bdf4a178d0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 25895 × 62710\n", " obs: 'sample', 'drugname_drugconc', 'drug', 'n_cells', 'tscp_count', 'plate', 'Cell_Name_Vevo', 'Cell_ID_Cellosaur'\n", " var: 'gene_id', 'genome'" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "single_dose_pb_adata" ] }, { "cell_type": "code", "execution_count": 99, "id": "9fa045e6-c911-42e5-b8ee-7ce3a4da2e32", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "plate\n", "6 4700\n", "14 4696\n", "8 4100\n", "2 3799\n", "12 3600\n", "13 1400\n", "11 1250\n", "3 1050\n", "9 750\n", "5 150\n", "4 100\n", "1 100\n", 
"10 100\n", "7 100\n", "Name: count, dtype: int64" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "single_dose_pb_adata.obs['plate'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "8225459f-7508-40bd-abb1-3bbafdbcb905", "metadata": {}, "outputs": [], "source": [ "plate" ] }, { "cell_type": "code", "execution_count": 36, "id": "86f77c95-d962-4cd2-a66e-c3d0e5d0e13f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sampledrugname_drugconcdrugn_cellstscp_countplateCell_Name_VevoCell_ID_Cellosaur
0smp_1975[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline30766686156.06A549CVCL_0023
1smp_1975[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline15054744833.06HS-578TCVCL_0332
2smp_1975[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline17002494504.06HCT15CVCL_0292
3smp_1975[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline35608923357.06HOP62CVCL_1285
4smp_1975[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline18764676765.06SK-MEL-2CVCL_0069
...........................
3895-13smp_2724[('Plicamycin', 0.5, 'uM')]Plicamycin11881869984.013hTERT-HPNECVCL_C466
3896-13smp_2724[('Plicamycin', 0.5, 'uM')]Plicamycin15073413185.013NCI-H23CVCL_1547
3897-13smp_2724[('Plicamycin', 0.5, 'uM')]Plicamycin18774635019.013NCI-H1792CVCL_1495
3898-13smp_2724[('Plicamycin', 0.5, 'uM')]Plicamycin2831629.013NCI-H2122CVCL_1531
3899-13smp_2724[('Plicamycin', 0.5, 'uM')]Plicamycin12073569081.013NCI-H2030CVCL_1517
\n", "

25895 rows × 8 columns

\n", "
" ], "text/plain": [ " sample drugname_drugconc drug \\\n", "0 smp_1975 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "1 smp_1975 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "2 smp_1975 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "3 smp_1975 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "4 smp_1975 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "... ... ... ... \n", "3895-13 smp_2724 [('Plicamycin', 0.5, 'uM')] Plicamycin \n", "3896-13 smp_2724 [('Plicamycin', 0.5, 'uM')] Plicamycin \n", "3897-13 smp_2724 [('Plicamycin', 0.5, 'uM')] Plicamycin \n", "3898-13 smp_2724 [('Plicamycin', 0.5, 'uM')] Plicamycin \n", "3899-13 smp_2724 [('Plicamycin', 0.5, 'uM')] Plicamycin \n", "\n", " n_cells tscp_count plate Cell_Name_Vevo Cell_ID_Cellosaur \n", "0 3076 6686156.0 6 A549 CVCL_0023 \n", "1 1505 4744833.0 6 HS-578T CVCL_0332 \n", "2 1700 2494504.0 6 HCT15 CVCL_0292 \n", "3 3560 8923357.0 6 HOP62 CVCL_1285 \n", "4 1876 4676765.0 6 SK-MEL-2 CVCL_0069 \n", "... ... ... ... ... ... \n", "3895-13 1188 1869984.0 13 hTERT-HPNE CVCL_C466 \n", "3896-13 1507 3413185.0 13 NCI-H23 CVCL_1547 \n", "3897-13 1877 4635019.0 13 NCI-H1792 CVCL_1495 \n", "3898-13 28 31629.0 13 NCI-H2122 CVCL_1531 \n", "3899-13 1207 3569081.0 13 NCI-H2030 CVCL_1517 \n", "\n", "[25895 rows x 8 columns]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "single_dose_pb_adata.obs" ] }, { "cell_type": "code", "execution_count": 48, "id": "9048688a-0e21-4658-a247-5aaf09a6fc57", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sampledrugname_drugconcdrugn_cellstscp_countplateCell_Name_VevoCell_ID_Cellosaur
0-1smp_2743[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline15492972368.014A549CVCL_0023
1-1smp_2743[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline8952426161.014HS-578TCVCL_0332
2-1smp_2743[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline7161007837.014HCT15CVCL_0292
3-1smp_2743[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline18864019130.014HOP62CVCL_1285
4-1smp_2743[('8-Hydroxyquinoline', 5.0, 'uM')]8-Hydroxyquinoline8451851405.014SK-MEL-2CVCL_0069
...........................
4791-1smp_2838[('DMSO_TF', 0.0, 'uM')]DMSO_TF12682070209.014hTERT-HPNECVCL_C466
4792-1smp_2838[('DMSO_TF', 0.0, 'uM')]DMSO_TF12742984503.014NCI-H23CVCL_1547
4793-1smp_2838[('DMSO_TF', 0.0, 'uM')]DMSO_TF9382223218.014NCI-H1792CVCL_1495
4794-1smp_2838[('DMSO_TF', 0.0, 'uM')]DMSO_TF1722536.014NCI-H2122CVCL_1531
4795-1smp_2838[('DMSO_TF', 0.0, 'uM')]DMSO_TF16616106461.014NCI-H2030CVCL_1517
\n", "

4696 rows × 8 columns

\n", "
" ], "text/plain": [ " sample drugname_drugconc drug \\\n", "0-1 smp_2743 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "1-1 smp_2743 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "2-1 smp_2743 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "3-1 smp_2743 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "4-1 smp_2743 [('8-Hydroxyquinoline', 5.0, 'uM')] 8-Hydroxyquinoline \n", "... ... ... ... \n", "4791-1 smp_2838 [('DMSO_TF', 0.0, 'uM')] DMSO_TF \n", "4792-1 smp_2838 [('DMSO_TF', 0.0, 'uM')] DMSO_TF \n", "4793-1 smp_2838 [('DMSO_TF', 0.0, 'uM')] DMSO_TF \n", "4794-1 smp_2838 [('DMSO_TF', 0.0, 'uM')] DMSO_TF \n", "4795-1 smp_2838 [('DMSO_TF', 0.0, 'uM')] DMSO_TF \n", "\n", " n_cells tscp_count plate Cell_Name_Vevo Cell_ID_Cellosaur \n", "0-1 1549 2972368.0 14 A549 CVCL_0023 \n", "1-1 895 2426161.0 14 HS-578T CVCL_0332 \n", "2-1 716 1007837.0 14 HCT15 CVCL_0292 \n", "3-1 1886 4019130.0 14 HOP62 CVCL_1285 \n", "4-1 845 1851405.0 14 SK-MEL-2 CVCL_0069 \n", "... ... ... ... ... ... 
\n", "4791-1 1268 2070209.0 14 hTERT-HPNE CVCL_C466 \n", "4792-1 1274 2984503.0 14 NCI-H23 CVCL_1547 \n", "4793-1 938 2223218.0 14 NCI-H1792 CVCL_1495 \n", "4794-1 17 22536.0 14 NCI-H2122 CVCL_1531 \n", "4795-1 1661 6106461.0 14 NCI-H2030 CVCL_1517 \n", "\n", "[4696 rows x 8 columns]" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "single_dose_pb_adata[single_dose_pb_adata.obs['plate']=='14'].obs" ] },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "3be82dba-49c2-44a0-9286-1183d90ff792",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define held out data for the validation and test sets\n",
    "val_one_per_moa = [\"Phenylephrine (hydrochloride)\",\"Darolutamide\", \"palbociclib\", \"Tolmetin\", \"Procainamide (hydrochloride)\", \"Trifluridine\", \"Simotinib\", \"Methylprednisolone succinate\", \"Dapagliflozin\", \"CP21R7\", \"Panobinostat\", \"Tofacitinib\", \"Trametinib\", \"Vinblastine (sulfate)\", \"Temsirolimus\", \"Sunitinib\", \"Ralimetinib dimesylate\", \"Tirabrutinib\", \"GSK1059615\", \"SBI-0640756\", \"Lonafarnib\", \"Retinoic acid\"]\n",
    "val_one_entire_moa = [\"Bortezomib\", \"Ixazomib\", \"Ixazomib citrate\"]\n",
    "\n",
    "val_one_per_organ = [\"CVCL_1724\", \"CVCL_0179\", \"CVCL_1715\", \"CVCL_0366\", \"CVCL_1550\", \"CVCL_0480\", \"CVCL_0069\" ]\n",
    "val_one_entire_organ = [\"CVCL_0359\"]\n",
    "\n",
    "test_one_per_moa = [\"Vilanterol\", \"Flutamide\", \"Ribociclib\", \"Valdecoxib\", \"γ-Oryzanol\", \"Trimetrexate\", \"Tucatinib\", \"Triamcinolone\", \"Dapagliflozin ((2S)-1,2-propanediol, hydrate)\", \"LY2090314\", \"Tucidinostat\", \"Tofacitinib (citrate)\", \"Trametinib (DMSO_TF solvate)\", \"vincristine\", \"Torkinib\", \"Vandetanib\", \"Temuterkib\", \"Tirabrutinib (hydrochloride)\", \"Ipatasertib\", \"Tomivosertib\", \"RMC-6236\", \"Tazarotene\"]\n",
    "test_one_entire_moa = [\"Glasdegib\" , \"Sonidegib\", \"Vismodegib\"]\n",
    "\n",
    "test_one_per_organ = [\"CVCL_0397\", \"CVCL_1239\", \"CVCL_0371\", \"CVCL_1716\", \"CVCL_C466\", \"CVCL_1666\", \"CVCL_0293\"]\n",
    "test_one_entire_organ = [\"CVCL_1094\"]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "649dff1e-f33e-44c7-a43f-26e7b8db3b5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Held-out drug AND held-out cell line, plus every observation from plate 14.\n",
    "# `plate` is compared as a string: its values look like \"14\", never \"plate14\".\n",
    "single_dose_pb_adata[\n",
    "    (\n",
    "        (\n",
    "            single_dose_pb_adata.obs[\"drug\"].isin(test_one_per_moa)\n",
    "            | single_dose_pb_adata.obs[\"drug\"].isin(test_one_entire_moa)\n",
    "        )\n",
    "        & (\n",
    "            single_dose_pb_adata.obs[\"Cell_ID_Cellosaur\"].isin(test_one_per_organ)\n",
    "            | single_dose_pb_adata.obs[\"Cell_ID_Cellosaur\"].isin(test_one_entire_organ)\n",
    "        )\n",
    "    )\n",
    "    | (single_dose_pb_adata.obs[\"plate\"].astype(str) == \"14\")\n",
    "].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a01009e6-2bc5-4768-8c8f-06755f053eed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test split: any held-out drug, any held-out cell line, or anything on plate 14.\n",
    "test_data = single_dose_pb_adata[\n",
    "    single_dose_pb_adata.obs[\"drug\"].isin(test_one_per_moa)\n",
    "    | single_dose_pb_adata.obs[\"drug\"].isin(test_one_entire_moa)\n",
    "    | single_dose_pb_adata.obs[\"Cell_ID_Cellosaur\"].isin(test_one_per_organ)\n",
    "    | single_dose_pb_adata.obs[\"Cell_ID_Cellosaur\"].isin(test_one_entire_organ)\n",
    "    | (single_dose_pb_adata.obs[\"plate\"].astype(str) == \"14\")\n",
    "].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9649654d-5b21-4a74-96b7-7723f1763622",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Validation split: any held-out drug or cell line, but never plate 14 (reserved for test).\n",
    "# The OR-terms are parenthesised as a group because `&` binds tighter than `|`;\n",
    "# without the parentheses the plate condition would only apply to the last term.\n",
    "val_data = single_dose_pb_adata[\n",
    "    (\n",
    "        single_dose_pb_adata.obs[\"drug\"].isin(val_one_per_moa)\n",
    "        | single_dose_pb_adata.obs[\"drug\"].isin(val_one_entire_moa)\n",
    "        | single_dose_pb_adata.obs[\"Cell_ID_Cellosaur\"].isin(val_one_per_organ)\n",
    "        | single_dose_pb_adata.obs[\"Cell_ID_Cellosaur\"].isin(val_one_entire_organ)\n",
    "    )\n",
    "    & (single_dose_pb_adata.obs[\"plate\"].astype(str) != \"14\")\n",
    "].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd61a18b-5c40-4d0b-b6e2-175cb053bb2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "val_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bad556cd-d1ad-4f41-98cb-90cde7caf508",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(set(test_data.obs_names.tolist()).difference(set(val_data.obs_names.tolist())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "577648ec-bf55-412d-bfa4-d48e4906a5fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pandas Index has no `notin`; negate `isin` to drop the observations that are\n",
    "# also part of the test split, so that validation and test stay disjoint.\n",
    "val_data = val_data[~val_data.obs_names.isin(test_data.obs_names)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8def410-7614-4aef-abb1-98e0096c6568",
   "metadata": {},
   "outputs": [],
   "source": [
    "val_data, test_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5c78151-5a8c-4469-96ca-71cb35f4718d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train split: every observation that is in neither the validation nor the test split.\n",
    "train_data = single_dose_pb_adata[\n",
    "    ~(\n",
    "        single_dose_pb_adata.obs_names.isin(val_data.obs_names)\n",
    "        | single_dose_pb_adata.obs_names.isin(test_data.obs_names)\n",
    "    )\n",
    "]\n",
    "train_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "6bd8ef40-03de-423e-b6e6-a3945688cc8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Validation filters\n",
    "val_compounds = set([\"Phenylephrine (hydrochloride)\",\"Darolutamide\", \"palbociclib\", \"Tolmetin\", \"Procainamide (hydrochloride)\", \"Trifluridine\", \"Simotinib\", \"Methylprednisolone succinate\", \"Dapagliflozin\", \"CP21R7\", \"Panobinostat\", \"Tofacitinib\", \"Trametinib\", \"Vinblastine (sulfate)\", \"Temsirolimus\", \"Sunitinib\", \"Ralimetinib dimesylate\", \"Tirabrutinib\", \"GSK1059615\", \"SBI-0640756\", \"Lonafarnib\", \"Retinoic acid\"])\n",
    "val_moas = set([\"Bortezomib\", \"Ixazomib\", \"Ixazomib citrate\"])\n",
    "val_cell_lines = set([\"CVCL_1724\", \"CVCL_0179\", \"CVCL_1715\", \"CVCL_0366\", \"CVCL_1550\", \"CVCL_0480\", \"CVCL_0069\"])\n",
    "val_entire_cell_line = set([\"CVCL_0359\"])\n",
    "\n",
    "# Test filters\n",
    "test_compounds = set([\"Vilanterol\",
\"Flutamide\", \"Ribociclib\", \"Valdecoxib\", \"γ-Oryzanol\", \"Trimetrexate\", \"Tucatinib\", \"Triamcinolone\", \"Dapagliflozin ((2S)-1,2-propanediol, hydrate)\", \"LY2090314\", \"Tucidinostat\", \"Tofacitinib (citrate)\", \"Trametinib (DMSO_TF solvate)\", \"vincristine\", \"Torkinib\", \"Vandetanib\", \"Temuterkib\", \"Tirabrutinib (hydrochloride)\", \"Ipatasertib\", \"Tomivosertib\", \"RMC-6236\", \"Tazarotene\"])\n",
    "test_moas = set([\"Glasdegib\" , \"Sonidegib\", \"Vismodegib\"])\n",
    "test_cell_lines = set([\"CVCL_0397\", \"CVCL_1239\", \"CVCL_0371\", \"CVCL_1716\", \"CVCL_C466\", \"CVCL_1666\", \"CVCL_0293\"])\n",
    "test_entire_cell_line = set([\"CVCL_1094\"])\n",
    "test_plates = set([\"14\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee1e5ddf-8dce-4a98-bf15-3898b3912f7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-observation metadata used to build the split masks.\n",
    "# The validation/test filter sets are defined once, in the cell above.\n",
    "df = single_dose_pb_adata.obs.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26df306f-95a2-4f42-884e-5e337e558039",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test set: compound, MOA, cell-line, or plate 14\n",
    "test_mask = (\n",
    "    df['drug'].isin(test_compounds)\n",
    "    | df['drug'].isin(test_moas)\n",
    "    | df['Cell_ID_Cellosaur'].isin(test_cell_lines)\n",
    "    | df['Cell_ID_Cellosaur'].isin(test_entire_cell_line)\n",
    "    | df['plate'].astype(str).isin(test_plates)\n",
    ")\n",
    "\n",
    "# Validation set: compound, MOA, or cell-line match.\n",
    "# Rows that already belong to the test set are excluded: a validation drug can be\n",
    "# paired with a test cell line (or sit on plate 14), and without the exclusion the\n",
    "# same observation would be written to both val.h5ad and test.h5ad.\n",
    "val_mask = (\n",
    "    df['drug'].isin(val_compounds)\n",
    "    | df['drug'].isin(val_moas)\n",
    "    | df['Cell_ID_Cellosaur'].isin(val_cell_lines)\n",
    "    | df['Cell_ID_Cellosaur'].isin(val_entire_cell_line)\n",
    ") & ~test_mask\n",
    "\n",
    "# Train set: anything that is not in val or test\n",
    "train_mask = ~(val_mask | test_mask)\n",
    "\n",
    "val_data = df[val_mask]\n",
    "test_data = df[test_mask]\n",
    "\n",
    "val_indices = val_data.index\n",
    "test_indices = test_data.index\n",
    "train_indices = df.index[train_mask].tolist()  # kept in frame order, not set order\n",
    "\n",
    "# The three splits must partition the observations exactly.\n",
    "assert len(train_indices) + len(val_indices) + len(test_indices) == len(df)\n",
    "\n",
    "train_adata = single_dose_pb_adata[single_dose_pb_adata.obs_names.isin(train_indices)].copy()\n",
    "val_adata = single_dose_pb_adata[single_dose_pb_adata.obs_names.isin(val_indices)].copy()\n",
    "test_adata = single_dose_pb_adata[single_dose_pb_adata.obs_names.isin(test_indices)].copy()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d40b97cf-ad6f-4258-a7ea-df47c42e70a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(val_indices), len(test_indices), len(train_indices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "id": "1ef10d03-8c0b-4c5b-9d97-b79aed7746f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_adata =
single_dose_pb_adata[single_dose_pb_adata.obs_names.isin(train_indices)].copy()\n",
    "val_adata = single_dose_pb_adata[single_dose_pb_adata.obs_names.isin(val_indices)].copy()\n",
    "test_adata = single_dose_pb_adata[single_dose_pb_adata.obs_names.isin(test_indices)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "id": "23e0e256-2265-480a-87f7-417d883d5011",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(AnnData object with n_obs × n_vars = 12681 × 62710\n",
       " obs: 'sample', 'drugname_drugconc', 'drug', 'n_cells', 'tscp_count', 'plate', 'Cell_Name_Vevo', 'Cell_ID_Cellosaur'\n",
       " var: 'gene_id', 'genome',\n",
       " AnnData object with n_obs × n_vars = 5446 × 62710\n",
       " obs: 'sample', 'drugname_drugconc', 'drug', 'n_cells', 'tscp_count', 'plate', 'Cell_Name_Vevo', 'Cell_ID_Cellosaur'\n",
       " var: 'gene_id', 'genome',\n",
       " AnnData object with n_obs × n_vars = 9138 × 62710\n",
       " obs: 'sample', 'drugname_drugconc', 'drug', 'n_cells', 'tscp_count', 'plate', 'Cell_Name_Vevo', 'Cell_ID_Cellosaur'\n",
       " var: 'gene_id', 'genome')"
      ]
     },
     "execution_count": 155,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_adata, val_adata, test_adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "id": "05e7071b-4060-4182-b1c6-b3a95da431d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist each split as its own .h5ad file, in train / val / test order.\n",
    "for out_path, split_adata in {\n",
    "    './pseudobulk/train.h5ad': train_adata,\n",
    "    './pseudobulk/val.h5ad': val_adata,\n",
    "    './pseudobulk/test.h5ad': test_adata,\n",
    "}.items():\n",
    "    split_adata.write_h5ad(out_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22755212-4bcd-4156-b8db-5f4078fd8e5e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "175a4025-ed09-43cf-9033-896da440ca28",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Cell lines that are left out of the (drug, cell line) grid.\n",
    "excluded_cell_lines = [\"CVCL_1577\", \"CVCL_1571\", \"CVCL_1531\", \"CVCL_2651\"]\n",
    "\n",
    "drugs = pd.read_csv(\"drugs.csv\").values.flatten().tolist()\n",
    "\n",
    "# One (drug, cell_line) pair per drug for every retained cell line;\n",
    "# cell lines vary slowest, drugs fastest.\n",
    "combinations = [\n",
    "    (d, c)\n",
    "    for c in Xs[0].obs[\"cell_line\"].unique()\n",
    "    if c not in excluded_cell_lines\n",
    "    for d in drugs\n",
    "]\n",
    "\n",
    "print(combinations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a72b438a-a165-4806-b71e-3e0518ded6c5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0697990c-b43f-4b29-8fc9-eb28fe402df9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44494151-02c3-463f-a7ea-49df1245fa49",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}