Buckets:

fuxinyu
/

Geneformer-bucket

Files

xet

fuxinyu/Geneformer-bucket / examples /distributed_multitask_cell_classification.ipynb

fuxinyu

4 days ago

download

raw

5.11 kB

	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b3266a7b",
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"import torch\n",
	"from geneformer import MTLClassifier"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "3e12ac9f",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Inputs: pretrained model plus the train/val/test splits\n",
	"# (each split is tokenized rank value encodings generated by the Geneformer tokenizer;\n",
	"# see tokenizing_scRNAseq_data.ipynb)\n",
	"pretrained_path = \"/path/to/pretrained/Geneformer/model\"\n",
	"train_path = \"/path/to/train/data.dataset\"\n",
	"val_path = \"/path/to/val/data.dataset\"\n",
	"test_path = \"/path/to/test/data.dataset\"\n",
	"\n",
	"# Output locations for results, the saved model and TensorBoard logs\n",
	"results_dir = \"/path/to/results/directory\"\n",
	"model_save_path = \"/path/to/model/save/path\"\n",
	"tensorboard_log_dir = \"/path/to/tensorboard/log/dir\"\n",
	"\n",
	"# Tasks: list the dataset column names to classify, one column per task\n",
	"# (for example cell type or disease state)\n",
	"task_columns = [\"cell_type\", \"disease_state\"]  # Example task columns"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "c9bd7562",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Count the CUDA devices PyTorch can see; more than one enables distributed training\n",
	"num_gpus = torch.cuda.device_count()\n",
	"use_distributed = num_gpus > 1\n",
	"print(f\"Number of GPUs detected: {num_gpus}\")\n",
	"print(f\"Using distributed training: {use_distributed}\")\n",
	"\n",
	"# Multi-GPU only: export the master address and port as environment variables\n",
	"if use_distributed:\n",
	"    os.environ.update(\n",
	"        MASTER_ADDR=\"localhost\",  # hostname\n",
	"        MASTER_PORT=\"12355\",  # Choose an available port\n",
	"    )\n",
	"    print(\"Distributed environment variables set.\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b6ff3618",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Search space for hyperparameter optimization\n",
	"# Each entry names a hyperparameter and gives its type plus bounds or choices\n",
	"hyperparameters = dict(\n",
	"    learning_rate=dict(type=\"float\", low=1e-5, high=1e-3, log=True),\n",
	"    warmup_ratio=dict(type=\"float\", low=0.005, high=0.01),\n",
	"    weight_decay=dict(type=\"float\", low=0.01, high=0.1),\n",
	"    dropout_rate=dict(type=\"float\", low=0.0, high=0.7),\n",
	"    lr_scheduler_type=dict(type=\"categorical\", choices=[\"cosine\"]),\n",
	"    task_weights=dict(type=\"float\", low=0.1, high=2.0),\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "f665c5a7",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Build the multi-task classifier from the paths, tasks and search space defined above\n",
	"mc = MTLClassifier(\n",
	"    task_columns=task_columns,  # Our defined classification tasks\n",
	"    study_name=\"MTLClassifier_distributed\",\n",
	"    pretrained_path=pretrained_path,\n",
	"    train_path=train_path,\n",
	"    val_path=val_path,\n",
	"    test_path=test_path,\n",
	"    model_save_path=model_save_path,\n",
	"    results_dir=results_dir,\n",
	"    tensorboard_log_dir=tensorboard_log_dir,\n",
	"    hyperparameters=hyperparameters,\n",
	"    # Distributed training parameters\n",
	"    distributed_training=use_distributed,  # Enable distributed training if multiple GPUs available\n",
	"    # Read the address/port back from the MASTER_ADDR / MASTER_PORT environment variables\n",
	"    # set in the GPU-check cell, so the values live in one place and cannot drift apart;\n",
	"    # the defaults only apply if those variables are missing\n",
	"    master_addr=os.environ.get(\"MASTER_ADDR\", \"localhost\") if use_distributed else None,\n",
	"    master_port=os.environ.get(\"MASTER_PORT\", \"12355\") if use_distributed else None,\n",
	"    # Other training parameters\n",
	"    n_trials=15,  # Number of trials for hyperparameter optimization\n",
	"    epochs=1,  # Number of training epochs (1 suggested to prevent overfitting)\n",
	"    batch_size=8,  # Adjust based on available GPU memory\n",
	"    gradient_accumulation_steps=4,  # Accumulate gradients over multiple steps\n",
	"    gradient_clipping=True,  # Enable gradient clipping for stability\n",
	"    max_grad_norm=1.0,  # Set maximum gradient norm\n",
	"    seed=42,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "f69f7b6a",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Launch the hyperparameter optimization study (with distributed training when enabled)\n",
	"# The __main__ guard is required for distributed training: it prevents\n",
	"# infinite subprocess spawning when torch.multiprocessing is used\n",
	"if __name__ == \"__main__\":\n",
	"    mc.run_optuna_study()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "3affd5dd",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Final step: evaluate the model on the test data\n",
	"# (same __main__ guard as the optimization cell)\n",
	"if __name__ == \"__main__\":\n",
	"    mc.load_and_evaluate_test_model()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "bio",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"name": "python",
	"version": "3.12.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}

Xet Storage Details

Size:: 5.11 kB
Xet hash:: a09d6199c263b0a4d8d69c1e59557277096c014805745d08c48918f759a53070

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.