Bug fixes in stats and added gene dict
#12
by
davidjwen
- opened
This view is limited to 50 files because it contains too many changes.
See the raw diff here.
- .gitattributes +2 -3
- .pre-commit-config.yaml +0 -26
- .readthedocs.yaml +0 -19
- Geneformer-V1-10M/config.json +0 -23
- Geneformer-V1-10M/model.safetensors +0 -3
- Geneformer-V1-10M/training_args.bin +0 -3
- Geneformer-V2-104M/config.json +0 -24
- Geneformer-V2-104M/generation_config.json +0 -5
- Geneformer-V2-104M/model.safetensors +0 -3
- Geneformer-V2-104M/training_args.bin +0 -3
- Geneformer-V2-104M_CLcancer/config.json +0 -25
- Geneformer-V2-104M_CLcancer/generation_config.json +0 -5
- Geneformer-V2-104M_CLcancer/model.safetensors +0 -3
- Geneformer-V2-104M_CLcancer/training_args.bin +0 -3
- Geneformer-V2-316M/config.json +0 -24
- Geneformer-V2-316M/generation_config.json +0 -5
- Geneformer-V2-316M/model.safetensors +0 -3
- Geneformer-V2-316M/training_args.bin +0 -3
- MANIFEST.in +2 -9
- README.md +8 -69
- benchmarking/castle_cell_type_annotation.r +80 -0
- benchmarking/prepare_datasplits_for_cell_type_annotation.ipynb +288 -0
- benchmarking/randomForest_token_classifier_dosageTF_10k.ipynb +0 -0
- benchmarking/scDeepsort_train_predict.ipynb +166 -0
- config.json +10 -11
- docs/Makefile +0 -20
- docs/make.bat +0 -35
- docs/requirements.txt +0 -3
- docs/source/_static/css/custom.css +0 -40
- docs/source/_static/gf_logo.png +0 -0
- docs/source/about.rst +0 -49
- docs/source/api.rst +0 -51
- docs/source/conf.py +0 -80
- docs/source/geneformer.classifier.rst +0 -10
- docs/source/geneformer.emb_extractor.rst +0 -26
- docs/source/geneformer.in_silico_perturber.rst +0 -8
- docs/source/geneformer.in_silico_perturber_stats.rst +0 -25
- docs/source/geneformer.mtl_classifier.rst +0 -11
- docs/source/geneformer.tokenizer.rst +0 -15
- docs/source/getstarted.rst +0 -36
- docs/source/index.rst +0 -16
- examples/cell_classification.ipynb +0 -0
- examples/distributed_multitask_cell_classification.ipynb +0 -149
- examples/example_input_files/bivalent_promoters/bivalent_gene_labels.txt +107 -0
- examples/example_input_files/bivalent_promoters/lys4_only_gene_labels.txt +80 -0
- examples/example_input_files/bivalent_promoters/no_methylation_gene_labels.txt +42 -0
- examples/example_input_files/dosage_sensitive_tfs/dosage_sens_tf_labels.csv +369 -0
- examples/example_input_files/gene_info_table.csv +0 -0
- examples/extract_and_plot_cell_embeddings.ipynb +0 -0
- examples/gene_classification.ipynb +0 -0
.gitattributes
CHANGED
|
@@ -14,11 +14,10 @@
|
|
| 14 |
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 15 |
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 16 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 18 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 19 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 20 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 22 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 23 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 24 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
|
@@ -26,4 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 26 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 28 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
|
|
|
|
| 14 |
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 15 |
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 16 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 17 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 18 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 19 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 21 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 22 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 23 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
geneformer/gene_name_id_dict.pkl filter=lfs diff=lfs merge=lfs -text
|
.pre-commit-config.yaml
DELETED
|
@@ -1,26 +0,0 @@
|
|
| 1 |
-
# See https://pre-commit.com for more information
|
| 2 |
-
# See https://pre-commit.com/hooks.html for more hooks
|
| 3 |
-
repos:
|
| 4 |
-
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 5 |
-
rev: v3.2.0
|
| 6 |
-
hooks:
|
| 7 |
-
- id: trailing-whitespace
|
| 8 |
-
- id: end-of-file-fixer
|
| 9 |
-
- id: check-yaml
|
| 10 |
-
- id: check-added-large-files
|
| 11 |
-
- id: check-merge-conflict
|
| 12 |
-
- id: mixed-line-ending
|
| 13 |
-
- id: check-docstring-first
|
| 14 |
-
- repo: https://github.com/pycqa/isort
|
| 15 |
-
rev: 5.12.0
|
| 16 |
-
hooks:
|
| 17 |
-
- id: isort
|
| 18 |
-
args: ["--profile", "black"]
|
| 19 |
-
- repo: https://github.com/astral-sh/ruff-pre-commit
|
| 20 |
-
# Ruff version.
|
| 21 |
-
rev: v0.1.4
|
| 22 |
-
hooks:
|
| 23 |
-
# Run the Ruff linter.
|
| 24 |
-
- id: ruff
|
| 25 |
-
# Run the Ruff formatter.
|
| 26 |
-
- id: ruff-format
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.readthedocs.yaml
DELETED
|
@@ -1,19 +0,0 @@
|
|
| 1 |
-
# Read the Docs configuration file
|
| 2 |
-
|
| 3 |
-
# Required
|
| 4 |
-
version: 2
|
| 5 |
-
|
| 6 |
-
# Set the OS, Python version and other tools you might need
|
| 7 |
-
build:
|
| 8 |
-
os: ubuntu-22.04
|
| 9 |
-
tools:
|
| 10 |
-
python: "3.10"
|
| 11 |
-
|
| 12 |
-
# Build documentation in the "docs/" directory with Sphinx
|
| 13 |
-
sphinx:
|
| 14 |
-
configuration: docs/source/conf.py
|
| 15 |
-
|
| 16 |
-
# Python requirements required to build your documentation
|
| 17 |
-
python:
|
| 18 |
-
install:
|
| 19 |
-
- requirements: docs/requirements.txt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V1-10M/config.json
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"architectures": [
|
| 3 |
-
"BertForMaskedLM"
|
| 4 |
-
],
|
| 5 |
-
"attention_probs_dropout_prob": 0.02,
|
| 6 |
-
"gradient_checkpointing": false,
|
| 7 |
-
"hidden_act": "relu",
|
| 8 |
-
"hidden_dropout_prob": 0.02,
|
| 9 |
-
"hidden_size": 256,
|
| 10 |
-
"initializer_range": 0.02,
|
| 11 |
-
"intermediate_size": 512,
|
| 12 |
-
"layer_norm_eps": 1e-12,
|
| 13 |
-
"max_position_embeddings": 2048,
|
| 14 |
-
"model_type": "bert",
|
| 15 |
-
"num_attention_heads": 4,
|
| 16 |
-
"num_hidden_layers": 6,
|
| 17 |
-
"pad_token_id": 0,
|
| 18 |
-
"position_embedding_type": "absolute",
|
| 19 |
-
"transformers_version": "4.6.0",
|
| 20 |
-
"type_vocab_size": 2,
|
| 21 |
-
"use_cache": true,
|
| 22 |
-
"vocab_size": 25426
|
| 23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V1-10M/model.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a5e33a757431643b3697de7ef6127950cdc49e06e58d4266b3a3ab191b683f14
|
| 3 |
-
size 41183536
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V1-10M/training_args.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f0ec3459454205174c9d2e4d6c6930f6b0fbf3364fc03a6f4d99c4d3add2012b
|
| 3 |
-
size 2607
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-104M/config.json
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"architectures": [
|
| 3 |
-
"BertForMaskedLM"
|
| 4 |
-
],
|
| 5 |
-
"attention_probs_dropout_prob": 0.1,
|
| 6 |
-
"classifier_dropout": null,
|
| 7 |
-
"hidden_act": "relu",
|
| 8 |
-
"hidden_dropout_prob": 0.1,
|
| 9 |
-
"hidden_size": 768,
|
| 10 |
-
"initializer_range": 0.02,
|
| 11 |
-
"intermediate_size": 3072,
|
| 12 |
-
"layer_norm_eps": 1e-12,
|
| 13 |
-
"max_position_embeddings": 4096,
|
| 14 |
-
"model_type": "bert",
|
| 15 |
-
"num_attention_heads": 12,
|
| 16 |
-
"num_hidden_layers": 12,
|
| 17 |
-
"pad_token_id": 0,
|
| 18 |
-
"position_embedding_type": "absolute",
|
| 19 |
-
"torch_dtype": "float32",
|
| 20 |
-
"transformers_version": "4.44.2",
|
| 21 |
-
"type_vocab_size": 2,
|
| 22 |
-
"use_cache": true,
|
| 23 |
-
"vocab_size": 20275
|
| 24 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-104M/generation_config.json
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"_from_model_config": true,
|
| 3 |
-
"pad_token_id": 0,
|
| 4 |
-
"transformers_version": "4.44.2"
|
| 5 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-104M/model.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:fff5cba29ddd8792991fa77b4872246fbe548a178cebda3775cdc72b67780e7f
|
| 3 |
-
size 417571156
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-104M/training_args.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:0d8ddd9e4f35b5fe23a3adaae03aa4480705ca82eed546a488f970adb3752d9d
|
| 3 |
-
size 5496
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-104M_CLcancer/config.json
DELETED
|
@@ -1,25 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"_name_or_path": "/gladstone/theodoris/lab/ctheodoris/gf-104m/models/241127_143148_geneformer_94M_L12_emb768_SL4096_E3_B18_LR0.0002_LScosine_WR0.007_Oadamw_DS13/models",
|
| 3 |
-
"architectures": [
|
| 4 |
-
"BertForMaskedLM"
|
| 5 |
-
],
|
| 6 |
-
"attention_probs_dropout_prob": 0.1,
|
| 7 |
-
"classifier_dropout": null,
|
| 8 |
-
"hidden_act": "relu",
|
| 9 |
-
"hidden_dropout_prob": 0.1,
|
| 10 |
-
"hidden_size": 768,
|
| 11 |
-
"initializer_range": 0.02,
|
| 12 |
-
"intermediate_size": 3072,
|
| 13 |
-
"layer_norm_eps": 1e-12,
|
| 14 |
-
"max_position_embeddings": 4096,
|
| 15 |
-
"model_type": "bert",
|
| 16 |
-
"num_attention_heads": 12,
|
| 17 |
-
"num_hidden_layers": 12,
|
| 18 |
-
"pad_token_id": 0,
|
| 19 |
-
"position_embedding_type": "absolute",
|
| 20 |
-
"torch_dtype": "float32",
|
| 21 |
-
"transformers_version": "4.37.1",
|
| 22 |
-
"type_vocab_size": 2,
|
| 23 |
-
"use_cache": true,
|
| 24 |
-
"vocab_size": 20275
|
| 25 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-104M_CLcancer/generation_config.json
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"_from_model_config": true,
|
| 3 |
-
"pad_token_id": 0,
|
| 4 |
-
"transformers_version": "4.37.1"
|
| 5 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-104M_CLcancer/model.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:827738139bfed4bafa9d1f3df7c6146da2e3b85f7225076adc32c6eda0ba4357
|
| 3 |
-
size 417571156
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-104M_CLcancer/training_args.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:8cf8ce52b498253adc6df53197a99821fa145c19b8ae5eeb8d15be76b8b7ddb3
|
| 3 |
-
size 4984
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-316M/config.json
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"architectures": [
|
| 3 |
-
"BertForMaskedLM"
|
| 4 |
-
],
|
| 5 |
-
"attention_probs_dropout_prob": 0.1,
|
| 6 |
-
"classifier_dropout": null,
|
| 7 |
-
"hidden_act": "relu",
|
| 8 |
-
"hidden_dropout_prob": 0.1,
|
| 9 |
-
"hidden_size": 1152,
|
| 10 |
-
"initializer_range": 0.02,
|
| 11 |
-
"intermediate_size": 4608,
|
| 12 |
-
"layer_norm_eps": 1e-12,
|
| 13 |
-
"max_position_embeddings": 4096,
|
| 14 |
-
"model_type": "bert",
|
| 15 |
-
"num_attention_heads": 18,
|
| 16 |
-
"num_hidden_layers": 18,
|
| 17 |
-
"pad_token_id": 0,
|
| 18 |
-
"position_embedding_type": "absolute",
|
| 19 |
-
"torch_dtype": "float32",
|
| 20 |
-
"transformers_version": "4.44.2",
|
| 21 |
-
"type_vocab_size": 2,
|
| 22 |
-
"use_cache": true,
|
| 23 |
-
"vocab_size": 20275
|
| 24 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-316M/generation_config.json
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"_from_model_config": true,
|
| 3 |
-
"pad_token_id": 0,
|
| 4 |
-
"transformers_version": "4.44.2"
|
| 5 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-316M/model.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:965ceccea81953d362081ef3843560a0e4fef88d396c28017881f1e94b1246f3
|
| 3 |
-
size 1265455076
|
|
|
|
|
|
|
|
|
|
|
|
Geneformer-V2-316M/training_args.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e45150f9a4ca34cb4e91ce79f65f3d99d9d66df9f66a37517a352d291008e0b8
|
| 3 |
-
size 5432
|
|
|
|
|
|
|
|
|
|
|
|
MANIFEST.in
CHANGED
|
@@ -1,9 +1,2 @@
|
|
| 1 |
-
include geneformer/
|
| 2 |
-
include geneformer/
|
| 3 |
-
include geneformer/ensembl_mapping_dict_gc104M.pkl
|
| 4 |
-
include geneformer/token_dictionary_gc104M.pkl
|
| 5 |
-
|
| 6 |
-
include geneformer/gene_dictionaries_30m/gene_median_dictionary_gc30M.pkl
|
| 7 |
-
include geneformer/gene_dictionaries_30m/gene_name_id_dict_gc30M.pkl
|
| 8 |
-
include geneformer/gene_dictionaries_30m/ensembl_mapping_dict_gc30M.pkl
|
| 9 |
-
include geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl
|
|
|
|
| 1 |
+
include geneformer/gene_median_dictionary.pkl
|
| 2 |
+
include geneformer/token_dictionary.pkl
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,91 +1,30 @@
|
|
| 1 |
---
|
| 2 |
datasets: ctheodoris/Genecorpus-30M
|
| 3 |
-
license: apache-2.0
|
| 4 |
-
tags:
|
| 5 |
-
- single-cell
|
| 6 |
-
- genomics
|
| 7 |
---
|
| 8 |
# Geneformer
|
| 9 |
-
Geneformer is a
|
| 10 |
|
| 11 |
-
|
| 12 |
-
- See [our manuscript](https://www.biorxiv.org/content/10.1101/2024.08.16.608180v1.full.pdf) for details of the expanded model, now trained on ~104 million transcriptomes, and our continual learning, multitask learning, and quantization strategies.
|
| 13 |
-
- See [geneformer.readthedocs.io](https://geneformer.readthedocs.io) for documentation.
|
| 14 |
|
| 15 |
# Model Description
|
| 16 |
-
Geneformer is a
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
During pretraining, Geneformer gained a fundamental understanding of network dynamics, encoding network hierarchy in the model’s attention weights in a completely self-supervised manner. With both zero-shot learning and fine-tuning with limited task-specific data, Geneformer consistently boosted predictive accuracy in a diverse panel of downstream tasks relevant to chromatin and network dynamics. In silico perturbation with zero-shot learning identified a novel transcription factor in cardiomyocytes that we experimentally validated to be critical to their ability to generate contractile force. In silico treatment with limited patient data revealed candidate therapeutic targets for cardiomyopathy that we experimentally validated to significantly improve the ability of cardiomyocytes to generate contractile force in an induced pluripotent stem cell (iPSC) model of the disease. Overall, Geneformer represents a foundational AI model pretrained on a large-scale corpus of human single cell transcriptomes to gain a fundamental understanding of gene network dynamics that can now be democratized to a vast array of downstream tasks to accelerate discovery of key network regulators and candidate therapeutic targets.
|
| 25 |
-
|
| 26 |
-
The repository includes the following pretrained models:
|
| 27 |
-
|
| 28 |
-
- Geneformer-V1-10M: original model trained June 2021 on ~30M human single cell transcriptomes, 10M parameters, input size 2048, vocabulary ~25K protein-coding or non-coding RNA genes
|
| 29 |
-
- Geneformer-V2-104M and Geneformer-V2-316M: updated model trained Dec 2024 on ~104M human single cell transcriptomes, 104M or 316M parameters, input size 4096, vocabulary ~20K protein-coding genes
|
| 30 |
-
|
| 31 |
-
The current default model in the main directory of the repository is Geneformer-V2-316M.
|
| 32 |
-
|
| 33 |
-
The repository also contains fine-tuned models in the fine_tuned_models directory and the cancer-tuned model following continual learning on ~14 million cancer cells, Geneformer-V2-104M_CLcancer.
|
| 34 |
|
| 35 |
# Application
|
| 36 |
The pretrained Geneformer model can be used directly for zero-shot learning, for example for in silico perturbation analysis, or by fine-tuning towards the relevant downstream task, such as gene or cell state classification.
|
| 37 |
|
| 38 |
-
Example applications demonstrated in [our manuscript](https://rdcu.be/ddrx0) include:
|
| 39 |
-
|
| 40 |
-
*Fine-tuning*:
|
| 41 |
-
- transcription factor dosage sensitivity
|
| 42 |
-
- chromatin dynamics (bivalently marked promoters)
|
| 43 |
-
- transcription factor regulatory range
|
| 44 |
-
- gene network centrality
|
| 45 |
-
- transcription factor targets
|
| 46 |
-
- cell type annotation
|
| 47 |
-
- batch integration
|
| 48 |
-
- cell state classification across differentiation
|
| 49 |
-
- disease classification
|
| 50 |
-
- in silico perturbation to determine disease-driving genes
|
| 51 |
-
- in silico treatment to determine candidate therapeutic targets
|
| 52 |
-
|
| 53 |
-
*Zero-shot learning*:
|
| 54 |
-
- batch integration
|
| 55 |
-
- gene context specificity
|
| 56 |
-
- in silico reprogramming
|
| 57 |
-
- in silico differentiation
|
| 58 |
-
- in silico perturbation to determine impact on cell state
|
| 59 |
-
- in silico perturbation to determine transcription factor targets
|
| 60 |
-
- in silico perturbation to determine transcription factor cooperativity
|
| 61 |
-
|
| 62 |
# Installation
|
| 63 |
-
In addition to the pretrained model, contained herein are functions for tokenizing and collating data specific to single cell transcriptomics, pretraining the model,
|
| 64 |
|
| 65 |
```bash
|
| 66 |
-
# Make sure you have git-lfs installed (https://git-lfs.com)
|
| 67 |
-
git lfs install
|
| 68 |
git clone https://huggingface.co/ctheodoris/Geneformer
|
| 69 |
cd Geneformer
|
| 70 |
pip install .
|
| 71 |
```
|
| 72 |
|
| 73 |
-
For usage, see [examples](https://huggingface.co/ctheodoris/Geneformer/tree/main/examples) for
|
| 74 |
-
- tokenizing transcriptomes
|
| 75 |
-
- pretraining
|
| 76 |
-
- hyperparameter tuning
|
| 77 |
-
- fine-tuning
|
| 78 |
-
- extracting and plotting cell embeddings
|
| 79 |
-
- in silico perturbation
|
| 80 |
-
|
| 81 |
-
Please also see [here](https://tinyurl.com/geneformertutorial) for a quickstart tutorial for predicting candidate therapeutic targets with Geneformer.
|
| 82 |
-
|
| 83 |
-
Complete documentation is available at https://geneformer.readthedocs.io/en/latest/.
|
| 84 |
-
|
| 85 |
-
Please note that the fine-tuning examples are meant to be generally applicable and the input datasets and labels will vary dependent on the downstream task. Example input files for a few of the downstream tasks demonstrated in the manuscript are located within the [example_input_files directory](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files) in the dataset repository, but these only represent a few example fine-tuning applications.
|
| 86 |
-
|
| 87 |
-
Please note that GPU resources are required for efficient usage of Geneformer. Additionally, we strongly recommend tuning hyperparameters for each downstream fine-tuning application as this can significantly boost predictive potential in the downstream task (e.g. max learning rate, learning schedule, number of layers to freeze, etc.). Importantly, as usual for deep learning models, there are no uniformly applicable default hyperparameters for Geneformer.
|
| 88 |
-
|
| 89 |
-
# Citations
|
| 90 |
-
- C V Theodoris#, L Xiao, A Chopra, M D Chaffin, Z R Al Sayed, M C Hill, H Mantineo, E Brydon, Z Zeng, X S Liu, P T Ellinor#. Transfer learning enables predictions in network biology. _**Nature**_, 31 May 2023. (#co-corresponding authors)
|
| 91 |
-
- H Chen*, M S Venkatesh*, J Gomez Ortega, S V Mahesh, T Nandi, R Madduri, K Pelka†, C V Theodoris†#. Quantized multi-task learning for context-specific representations of gene network dynamics. _**bioRxiv**_, 19 Aug 2024. (*co-first authors, †co-senior authors, #corresponding author)
|
|
|
|
| 1 |
---
|
| 2 |
datasets: ctheodoris/Genecorpus-30M
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
---
|
| 4 |
# Geneformer
|
| 5 |
+
Geneformer is a foundation transformer model pretrained on a large-scale corpus of ~30 million single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.
|
| 6 |
|
| 7 |
+
See [our manuscript](https://www.nature.com/articles/s41586-023-06139-9) for details.
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# Model Description
|
| 10 |
+
Geneformer is a foundation transformer model pretrained on [Genecorpus-30M](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M), a pretraining corpus comprised of ~30 million single cell transcriptomes from a broad range of human tissues. Each single cell’s transcriptome is presented to the model as a rank value encoding where genes are ranked by their expression in that cell normalized by their expression across the entire Genecorpus-30M. The rank value encoding provides a nonparametric representation of that cell’s transcriptome and takes advantage of the many observations of each gene’s expression across Genecorpus-30M to prioritize genes that distinguish cell state. Specifically, this method will deprioritize ubiquitously highly-expressed housekeeping genes by normalizing them to a lower rank. Conversely, genes such as transcription factors that may be lowly expressed when they are expressed but highly distinguish cell state will move to a higher rank within the encoding. Furthermore, this rank-based approach may be more robust against technical artifacts that may systematically bias the absolute transcript counts value while the overall relative ranking of genes within each cell remains more stable.
|
| 11 |
|
| 12 |
+
The rank value encoding of each single cell’s transcriptome then proceeds through six transformer encoder units. Pretraining was accomplished using a masked learning objective where 15% of the genes within each transcriptome were masked and the model was trained to predict which gene should be within each masked position in that specific cell state using the context of the remaining unmasked genes. A major strength of this approach is that it is entirely self-supervised and can be accomplished on completely unlabeled data, which allows the inclusion of large amounts of training data without being restricted to samples with accompanying labels.
|
| 13 |
|
| 14 |
+
We detail applications and results in [our manuscript](https://www.nature.com/articles/s41586-023-06139-9).
|
| 15 |
|
| 16 |
+
During pretraining, Geneformer gained a fundamental understanding of network dynamics, encoding network hierarchy in the model’s attention weights in a completely self-supervised manner. Fine-tuning Geneformer towards a diverse panel of downstream tasks relevant to chromatin and network dynamics using limited task-specific data demonstrated that Geneformer consistently boosted predictive accuracy. Applied to disease modeling with limited patient data, Geneformer identified candidate therapeutic targets. Overall, Geneformer represents an invaluable pretrained model from which fine-tuning towards a broad range of downstream applications can be pursued to accelerate discovery of key network regulators and candidate therapeutic targets.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Application
|
| 19 |
The pretrained Geneformer model can be used directly for zero-shot learning, for example for in silico perturbation analysis, or by fine-tuning towards the relevant downstream task, such as gene or cell state classification.
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
# Installation
|
| 22 |
+
In addition to the pretrained model, contained herein are functions for tokenizing and collating data specific to single cell transcriptomics, pretraining the model, and performing in silico perturbation with either the pretrained or fine-tuned models. To install:
|
| 23 |
|
| 24 |
```bash
|
|
|
|
|
|
|
| 25 |
git clone https://huggingface.co/ctheodoris/Geneformer
|
| 26 |
cd Geneformer
|
| 27 |
pip install .
|
| 28 |
```
|
| 29 |
|
| 30 |
+
For usage, see [examples](https://huggingface.co/ctheodoris/Geneformer/tree/main/examples) for pretraining and fine-tuning. Please note that GPU resources are required for efficient usage of Geneformer.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarking/castle_cell_type_annotation.r
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Usage: Rscript castle_cell_type_annotation.r organ
|
| 2 |
+
|
| 3 |
+
# parse ordered arguments
|
| 4 |
+
args <- commandArgs(trailingOnly=TRUE)
|
| 5 |
+
organ <- args[1]
|
| 6 |
+
|
| 7 |
+
suppressPackageStartupMessages(library(scater))
|
| 8 |
+
suppressPackageStartupMessages(library(xgboost))
|
| 9 |
+
suppressPackageStartupMessages(library(igraph))
|
| 10 |
+
BREAKS=c(-1, 0, 1, 6, Inf)
|
| 11 |
+
nFeatures = 100
|
| 12 |
+
|
| 13 |
+
print(paste("Training ", organ, sep=""))
|
| 14 |
+
|
| 15 |
+
# import training and test data
|
| 16 |
+
rootdir="/path/to/data/"
|
| 17 |
+
train_counts <- t(as.matrix(read.csv(file = paste(rootdir, organ, "_filtered_data_train.csv", sep=""), row.names = 1)))
|
| 18 |
+
test_counts <- t(as.matrix(read.csv(file = paste(rootdir, organ, "_filtered_data_test.csv", sep=""), row.names = 1)))
|
| 19 |
+
train_celltype <- as.matrix(read.csv(file = paste(rootdir, organ, "_filtered_celltype_train.csv", sep="")))
|
| 20 |
+
test_celltype <- as.matrix(read.csv(file = paste(rootdir, organ, "_filtered_celltype_test.csv", sep="")))
|
| 21 |
+
|
| 22 |
+
# select features
|
| 23 |
+
sourceCellTypes = as.factor(train_celltype[,"Cell_type"])
|
| 24 |
+
ds = rbind(train_counts,test_counts)
|
| 25 |
+
ds[is.na(ds)] <- 0
|
| 26 |
+
isSource = c(rep(TRUE,nrow(train_counts)), rep(FALSE,nrow(test_counts)))
|
| 27 |
+
topFeaturesAvg = colnames(ds[isSource,])[order(apply(ds[isSource,], 2, mean), decreasing = T)]
|
| 28 |
+
topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),sourceCellTypes,method = "nmi") }), decreasing = T))
|
| 29 |
+
selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )
|
| 30 |
+
tmp = cor(ds[isSource,selectedFeatures], method = "pearson")
|
| 31 |
+
tmp[!lower.tri(tmp)] = 0
|
| 32 |
+
selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]
|
| 33 |
+
remove(tmp)
|
| 34 |
+
|
| 35 |
+
# bin expression values and expand features by bins
|
| 36 |
+
dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)
|
| 37 |
+
nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })
|
| 38 |
+
ds = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))
|
| 39 |
+
remove(dsBins, nUniq)
|
| 40 |
+
|
| 41 |
+
# train model
|
| 42 |
+
train = runif(nrow(ds[isSource,]))<0.8
|
| 43 |
+
# slightly different setup for multiclass and binary classification
|
| 44 |
+
if (length(unique(sourceCellTypes)) > 2) {
|
| 45 |
+
xg=xgboost(data=ds[isSource,][train, ] ,
|
| 46 |
+
label=as.numeric(sourceCellTypes[train])-1,
|
| 47 |
+
objective="multi:softmax", num_class=length(unique(sourceCellTypes)),
|
| 48 |
+
eta=0.7 , nthread=5, nround=20, verbose=0,
|
| 49 |
+
gamma=0.001, max_depth=5, min_child_weight=10)
|
| 50 |
+
} else {
|
| 51 |
+
xg=xgboost(data=ds[isSource,][train, ] ,
|
| 52 |
+
label=as.numeric(sourceCellTypes[train])-1,
|
| 53 |
+
eta=0.7 , nthread=5, nround=20, verbose=0,
|
| 54 |
+
gamma=0.001, max_depth=5, min_child_weight=10)
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
# validate model
|
| 58 |
+
predictedClasses = predict(xg, ds[!isSource, ])
|
| 59 |
+
testCellTypes = as.factor(test_celltype[,"Cell_type"])
|
| 60 |
+
trueClasses <- as.numeric(testCellTypes)-1
|
| 61 |
+
|
| 62 |
+
cm <- as.matrix(table(Actual = trueClasses, Predicted = predictedClasses))
|
| 63 |
+
n <- sum(cm)
|
| 64 |
+
nc = nrow(cm) # number of classes
|
| 65 |
+
diag = diag(cm) # number of correctly classified instances per class
|
| 66 |
+
rowsums = apply(cm, 1, sum) # number of instances per class
|
| 67 |
+
colsums = apply(cm, 2, sum) # number of predictions per class
|
| 68 |
+
p = rowsums / n # distribution of instances over the actual classes
|
| 69 |
+
q = colsums / n # distribution of instances over the predicted classes
|
| 70 |
+
accuracy = sum(diag) / n
|
| 71 |
+
precision = diag / colsums
|
| 72 |
+
recall = diag / rowsums
|
| 73 |
+
f1 = 2 * precision * recall / (precision + recall)
|
| 74 |
+
macroF1 = mean(f1)
|
| 75 |
+
|
| 76 |
+
print(paste(organ, " accuracy: ", accuracy, sep=""))
|
| 77 |
+
print(paste(organ, " macroF1: ", macroF1, sep=""))
|
| 78 |
+
|
| 79 |
+
results_df = data.frame(Accuracy=c(accuracy),macroF1=c(macroF1))
|
| 80 |
+
write.csv(results_df,paste(rootdir, organ, "_castle_results_test.csv", sep=""), row.names = FALSE)
|
benchmarking/prepare_datasplits_for_cell_type_annotation.ipynb
ADDED
|
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "25107132",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"### Preparing train and test data splits for cell type annotation application"
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"cell_type": "code",
|
| 13 |
+
"execution_count": 3,
|
| 14 |
+
"id": "83d8d249-affe-45dd-915e-992b4b35b31a",
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"import os\n",
|
| 19 |
+
"import pandas as pd\n",
|
| 20 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 21 |
+
"from tqdm.notebook import tqdm\n",
|
| 22 |
+
"from collections import Counter\n",
|
| 23 |
+
"import pickle"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "code",
|
| 28 |
+
"execution_count": 4,
|
| 29 |
+
"id": "e3e6a2bf-44c8-4164-9ecd-1686230ea8be",
|
| 30 |
+
"metadata": {},
|
| 31 |
+
"outputs": [
|
| 32 |
+
{
|
| 33 |
+
"data": {
|
| 34 |
+
"text/plain": [
|
| 35 |
+
"['pancreas',\n",
|
| 36 |
+
" 'liver',\n",
|
| 37 |
+
" 'blood',\n",
|
| 38 |
+
" 'lung',\n",
|
| 39 |
+
" 'spleen',\n",
|
| 40 |
+
" 'placenta',\n",
|
| 41 |
+
" 'colorectum',\n",
|
| 42 |
+
" 'kidney',\n",
|
| 43 |
+
" 'brain']"
|
| 44 |
+
]
|
| 45 |
+
},
|
| 46 |
+
"execution_count": 4,
|
| 47 |
+
"metadata": {},
|
| 48 |
+
"output_type": "execute_result"
|
| 49 |
+
}
|
| 50 |
+
],
|
| 51 |
+
"source": [
|
| 52 |
+
"rootdir = \"/path/to/data/\"\n",
|
| 53 |
+
"\n",
|
| 54 |
+
"# collect panel of tissues to test\n",
|
| 55 |
+
"dir_list = []\n",
|
| 56 |
+
"for dir_i in os.listdir(rootdir):\n",
|
| 57 |
+
" if (\"results\" not in dir_i) & (os.path.isdir(os.path.join(rootdir, dir_i))):\n",
|
| 58 |
+
" dir_list += [dir_i]\n",
|
| 59 |
+
"dir_list"
|
| 60 |
+
]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"cell_type": "code",
|
| 64 |
+
"execution_count": 5,
|
| 65 |
+
"id": "0b205eec-a518-472a-ab90-dd63ef9803cd",
|
| 66 |
+
"metadata": {},
|
| 67 |
+
"outputs": [
|
| 68 |
+
{
|
| 69 |
+
"data": {
|
| 70 |
+
"text/html": [
|
| 71 |
+
"<div>\n",
|
| 72 |
+
"<style scoped>\n",
|
| 73 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 74 |
+
" vertical-align: middle;\n",
|
| 75 |
+
" }\n",
|
| 76 |
+
"\n",
|
| 77 |
+
" .dataframe tbody tr th {\n",
|
| 78 |
+
" vertical-align: top;\n",
|
| 79 |
+
" }\n",
|
| 80 |
+
"\n",
|
| 81 |
+
" .dataframe thead th {\n",
|
| 82 |
+
" text-align: right;\n",
|
| 83 |
+
" }\n",
|
| 84 |
+
"</style>\n",
|
| 85 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 86 |
+
" <thead>\n",
|
| 87 |
+
" <tr style=\"text-align: right;\">\n",
|
| 88 |
+
" <th></th>\n",
|
| 89 |
+
" <th>filter_pass</th>\n",
|
| 90 |
+
" <th>original_cell_id</th>\n",
|
| 91 |
+
" </tr>\n",
|
| 92 |
+
" </thead>\n",
|
| 93 |
+
" <tbody>\n",
|
| 94 |
+
" <tr>\n",
|
| 95 |
+
" <th>0</th>\n",
|
| 96 |
+
" <td>0</td>\n",
|
| 97 |
+
" <td>C_1</td>\n",
|
| 98 |
+
" </tr>\n",
|
| 99 |
+
" <tr>\n",
|
| 100 |
+
" <th>1</th>\n",
|
| 101 |
+
" <td>1</td>\n",
|
| 102 |
+
" <td>C_2</td>\n",
|
| 103 |
+
" </tr>\n",
|
| 104 |
+
" <tr>\n",
|
| 105 |
+
" <th>2</th>\n",
|
| 106 |
+
" <td>0</td>\n",
|
| 107 |
+
" <td>C_3</td>\n",
|
| 108 |
+
" </tr>\n",
|
| 109 |
+
" <tr>\n",
|
| 110 |
+
" <th>3</th>\n",
|
| 111 |
+
" <td>1</td>\n",
|
| 112 |
+
" <td>C_4</td>\n",
|
| 113 |
+
" </tr>\n",
|
| 114 |
+
" <tr>\n",
|
| 115 |
+
" <th>4</th>\n",
|
| 116 |
+
" <td>0</td>\n",
|
| 117 |
+
" <td>C_5</td>\n",
|
| 118 |
+
" </tr>\n",
|
| 119 |
+
" <tr>\n",
|
| 120 |
+
" <th>...</th>\n",
|
| 121 |
+
" <td>...</td>\n",
|
| 122 |
+
" <td>...</td>\n",
|
| 123 |
+
" </tr>\n",
|
| 124 |
+
" <tr>\n",
|
| 125 |
+
" <th>9590</th>\n",
|
| 126 |
+
" <td>1</td>\n",
|
| 127 |
+
" <td>C_9591</td>\n",
|
| 128 |
+
" </tr>\n",
|
| 129 |
+
" <tr>\n",
|
| 130 |
+
" <th>9591</th>\n",
|
| 131 |
+
" <td>1</td>\n",
|
| 132 |
+
" <td>C_9592</td>\n",
|
| 133 |
+
" </tr>\n",
|
| 134 |
+
" <tr>\n",
|
| 135 |
+
" <th>9592</th>\n",
|
| 136 |
+
" <td>1</td>\n",
|
| 137 |
+
" <td>C_9593</td>\n",
|
| 138 |
+
" </tr>\n",
|
| 139 |
+
" <tr>\n",
|
| 140 |
+
" <th>9593</th>\n",
|
| 141 |
+
" <td>1</td>\n",
|
| 142 |
+
" <td>C_9594</td>\n",
|
| 143 |
+
" </tr>\n",
|
| 144 |
+
" <tr>\n",
|
| 145 |
+
" <th>9594</th>\n",
|
| 146 |
+
" <td>1</td>\n",
|
| 147 |
+
" <td>C_9595</td>\n",
|
| 148 |
+
" </tr>\n",
|
| 149 |
+
" </tbody>\n",
|
| 150 |
+
"</table>\n",
|
| 151 |
+
"<p>9595 rows × 2 columns</p>\n",
|
| 152 |
+
"</div>"
|
| 153 |
+
],
|
| 154 |
+
"text/plain": [
|
| 155 |
+
" filter_pass original_cell_id\n",
|
| 156 |
+
"0 0 C_1\n",
|
| 157 |
+
"1 1 C_2\n",
|
| 158 |
+
"2 0 C_3\n",
|
| 159 |
+
"3 1 C_4\n",
|
| 160 |
+
"4 0 C_5\n",
|
| 161 |
+
"... ... ...\n",
|
| 162 |
+
"9590 1 C_9591\n",
|
| 163 |
+
"9591 1 C_9592\n",
|
| 164 |
+
"9592 1 C_9593\n",
|
| 165 |
+
"9593 1 C_9594\n",
|
| 166 |
+
"9594 1 C_9595\n",
|
| 167 |
+
"\n",
|
| 168 |
+
"[9595 rows x 2 columns]"
|
| 169 |
+
]
|
| 170 |
+
},
|
| 171 |
+
"execution_count": 5,
|
| 172 |
+
"metadata": {},
|
| 173 |
+
"output_type": "execute_result"
|
| 174 |
+
}
|
| 175 |
+
],
|
| 176 |
+
"source": [
|
| 177 |
+
"# dictionary of cell barcodes that passed QC filtering applied by Geneformer \n",
|
| 178 |
+
"# to ensure same cells were used for comparison\n",
|
| 179 |
+
"with open(f\"{rootdir}deepsort_filter_dict.pickle\", \"rb\") as fp:\n",
|
| 180 |
+
" filter_dict = pickle.load(fp)\n",
|
| 181 |
+
"\n",
|
| 182 |
+
"# for example:\n",
|
| 183 |
+
"filter_dict[\"human_Placenta9595_data\"]"
|
| 184 |
+
]
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"cell_type": "code",
|
| 188 |
+
"execution_count": null,
|
| 189 |
+
"id": "207e3571-0236-4493-83b3-a89b67b16cb2",
|
| 190 |
+
"metadata": {
|
| 191 |
+
"tags": []
|
| 192 |
+
},
|
| 193 |
+
"outputs": [],
|
| 194 |
+
"source": [
|
| 195 |
+
"for dir_name in tqdm(dir_list):\n",
|
| 196 |
+
"\n",
|
| 197 |
+
" df = pd.DataFrame()\n",
|
| 198 |
+
" ct_df = pd.DataFrame(columns=[\"Cell\",\"Cell_type\"])\n",
|
| 199 |
+
" \n",
|
| 200 |
+
" subrootdir = f\"{rootdir}{dir_name}/\"\n",
|
| 201 |
+
" for subdir, dirs, files in os.walk(subrootdir):\n",
|
| 202 |
+
" for i in range(len(files)):\n",
|
| 203 |
+
" file = files[i]\n",
|
| 204 |
+
" if file.endswith(\"_data.csv\"):\n",
|
| 205 |
+
" file_prefix = file.replace(\"_data.csv\",\"\")\n",
|
| 206 |
+
" sample_prefix = file.replace(\".csv\",\"\")\n",
|
| 207 |
+
" filter_df = filter_dict[sample_prefix]\n",
|
| 208 |
+
" sample_to_analyze = list(filter_df[filter_df[\"filter_pass\"]==1][\"original_cell_id\"])\n",
|
| 209 |
+
" \n",
|
| 210 |
+
" # collect data for each tissue\n",
|
| 211 |
+
" df_i = pd.read_csv(f\"{subrootdir}{file}\", index_col=0)\n",
|
| 212 |
+
" df_i = df_i[sample_to_analyze]\n",
|
| 213 |
+
" df_i.columns = [f\"{i}_{cell_id}\" for cell_id in df_i.columns]\n",
|
| 214 |
+
" df = pd.concat([df,df_i],axis=1)\n",
|
| 215 |
+
" \n",
|
| 216 |
+
" # collect cell type metadata\n",
|
| 217 |
+
" ct_df_i = pd.read_csv(f\"{subrootdir}{file_prefix}_celltype.csv\", index_col=0)\n",
|
| 218 |
+
" ct_df_i.columns = [\"Cell\",\"Cell_type\"]\n",
|
| 219 |
+
" ct_df_i[\"Cell\"] = [f\"{i}_{cell_id}\" for cell_id in ct_df_i[\"Cell\"]]\n",
|
| 220 |
+
" ct_df = pd.concat([ct_df,ct_df_i],axis=0)\n",
|
| 221 |
+
" \n",
|
| 222 |
+
" # per published scDeepsort method, filter data for cell types >0.5% of data\n",
|
| 223 |
+
" ct_counts = Counter(ct_df[\"Cell_type\"])\n",
|
| 224 |
+
" total_count = sum(ct_counts.values())\n",
|
| 225 |
+
" nonrare_cell_types = [cell_type for cell_type,count in ct_counts.items() if count>(total_count*0.005)]\n",
|
| 226 |
+
" nonrare_cells = list(ct_df[ct_df[\"Cell_type\"].isin(nonrare_cell_types)][\"Cell\"])\n",
|
| 227 |
+
" df = df[df.columns.intersection(nonrare_cells)]\n",
|
| 228 |
+
"\n",
|
| 229 |
+
" # split into 80/20 train/test data\n",
|
| 230 |
+
" train, test = train_test_split(df.T, test_size=0.2)\n",
|
| 231 |
+
" train = train.T\n",
|
| 232 |
+
" test = test.T \n",
|
| 233 |
+
" \n",
|
| 234 |
+
" # save filtered train/test data\n",
|
| 235 |
+
" train.to_csv(f\"{subrootdir}{dir_name}_filtered_data_train.csv\")\n",
|
| 236 |
+
" test.to_csv(f\"{subrootdir}{dir_name}_filtered_data_test.csv\")\n",
|
| 237 |
+
"\n",
|
| 238 |
+
" # split metadata into train/test data\n",
|
| 239 |
+
" ct_df_train = ct_df[ct_df[\"Cell\"].isin(list(train.columns))]\n",
|
| 240 |
+
" ct_df_test = ct_df[ct_df[\"Cell\"].isin(list(test.columns))]\n",
|
| 241 |
+
" train_order_dict = dict(zip(train.columns,[i for i in range(len(train.columns))]))\n",
|
| 242 |
+
" test_order_dict = dict(zip(test.columns,[i for i in range(len(test.columns))]))\n",
|
| 243 |
+
" ct_df_train[\"order\"] = [train_order_dict[cell_id] for cell_id in ct_df_train[\"Cell\"]]\n",
|
| 244 |
+
" ct_df_test[\"order\"] = [test_order_dict[cell_id] for cell_id in ct_df_test[\"Cell\"]]\n",
|
| 245 |
+
" ct_df_train = ct_df_train.sort_values(\"order\")\n",
|
| 246 |
+
" ct_df_test = ct_df_test.sort_values(\"order\")\n",
|
| 247 |
+
" ct_df_train = ct_df_train.drop(\"order\",axis=1)\n",
|
| 248 |
+
" ct_df_test = ct_df_test.drop(\"order\",axis=1)\n",
|
| 249 |
+
" assert list(ct_df_train[\"Cell\"]) == list(train.columns)\n",
|
| 250 |
+
" assert list(ct_df_test[\"Cell\"]) == list(test.columns)\n",
|
| 251 |
+
" train_labels = list(Counter(ct_df_train[\"Cell_type\"]).keys())\n",
|
| 252 |
+
" test_labels = list(Counter(ct_df_test[\"Cell_type\"]).keys())\n",
|
| 253 |
+
" assert set(train_labels) == set(test_labels)\n",
|
| 254 |
+
" \n",
|
| 255 |
+
" # save train/test cell type annotations\n",
|
| 256 |
+
" ct_df_train.to_csv(f\"{subrootdir}{dir_name}_filtered_celltype_train.csv\")\n",
|
| 257 |
+
" ct_df_test.to_csv(f\"{subrootdir}{dir_name}_filtered_celltype_test.csv\")\n",
|
| 258 |
+
" "
|
| 259 |
+
]
|
| 260 |
+
}
|
| 261 |
+
],
|
| 262 |
+
"metadata": {
|
| 263 |
+
"kernelspec": {
|
| 264 |
+
"display_name": "Python 3.8.6 64-bit ('3.8.6')",
|
| 265 |
+
"language": "python",
|
| 266 |
+
"name": "python3"
|
| 267 |
+
},
|
| 268 |
+
"language_info": {
|
| 269 |
+
"codemirror_mode": {
|
| 270 |
+
"name": "ipython",
|
| 271 |
+
"version": 3
|
| 272 |
+
},
|
| 273 |
+
"file_extension": ".py",
|
| 274 |
+
"mimetype": "text/x-python",
|
| 275 |
+
"name": "python",
|
| 276 |
+
"nbconvert_exporter": "python",
|
| 277 |
+
"pygments_lexer": "ipython3",
|
| 278 |
+
"version": "3.8.6"
|
| 279 |
+
},
|
| 280 |
+
"vscode": {
|
| 281 |
+
"interpreter": {
|
| 282 |
+
"hash": "eba1599a1f7e611c14c87ccff6793920aa63510b01fc0e229d6dd014149b8829"
|
| 283 |
+
}
|
| 284 |
+
}
|
| 285 |
+
},
|
| 286 |
+
"nbformat": 4,
|
| 287 |
+
"nbformat_minor": 5
|
| 288 |
+
}
|
benchmarking/randomForest_token_classifier_dosageTF_10k.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
benchmarking/scDeepsort_train_predict.ipynb
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 3,
|
| 6 |
+
"id": "83d8d249-affe-45dd-915e-992b4b35b31a",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import os\n",
|
| 11 |
+
"import numpy as np\n",
|
| 12 |
+
"import pandas as pd\n",
|
| 13 |
+
"import deepsort\n",
|
| 14 |
+
"from sklearn.metrics import accuracy_score, f1_score\n",
|
| 15 |
+
"from tqdm.notebook import tqdm\n",
|
| 16 |
+
"import pickle"
|
| 17 |
+
]
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"cell_type": "code",
|
| 21 |
+
"execution_count": 4,
|
| 22 |
+
"id": "25de46ec-8a41-484d-8e14-d2b19768fc2c",
|
| 23 |
+
"metadata": {},
|
| 24 |
+
"outputs": [],
|
| 25 |
+
"source": [
|
| 26 |
+
"def compute_metrics(labels, preds):\n",
|
| 27 |
+
"\n",
|
| 28 |
+
" # calculate accuracy and macro f1 using sklearn's function\n",
|
| 29 |
+
" acc = accuracy_score(labels, preds)\n",
|
| 30 |
+
" macro_f1 = f1_score(labels, preds, average='macro')\n",
|
| 31 |
+
" return {\n",
|
| 32 |
+
" 'accuracy': acc,\n",
|
| 33 |
+
" 'macro_f1': macro_f1\n",
|
| 34 |
+
" }"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"cell_type": "code",
|
| 39 |
+
"execution_count": 5,
|
| 40 |
+
"id": "a4029b2b-afca-4300-82a2-082fec59f191",
|
| 41 |
+
"metadata": {},
|
| 42 |
+
"outputs": [
|
| 43 |
+
{
|
| 44 |
+
"data": {
|
| 45 |
+
"text/plain": [
|
| 46 |
+
"['pancreas',\n",
|
| 47 |
+
" 'liver',\n",
|
| 48 |
+
" 'blood',\n",
|
| 49 |
+
" 'lung',\n",
|
| 50 |
+
" 'spleen',\n",
|
| 51 |
+
" 'placenta',\n",
|
| 52 |
+
" 'colorectum',\n",
|
| 53 |
+
" 'kidney',\n",
|
| 54 |
+
" 'brain']"
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
"execution_count": 5,
|
| 58 |
+
"metadata": {},
|
| 59 |
+
"output_type": "execute_result"
|
| 60 |
+
}
|
| 61 |
+
],
|
| 62 |
+
"source": [
|
| 63 |
+
"rootdir = \"/path/to/data/\"\n",
|
| 64 |
+
"\n",
|
| 65 |
+
"dir_list = []\n",
|
| 66 |
+
"for dir_i in os.listdir(rootdir):\n",
|
| 67 |
+
" if (\"results\" not in dir_i) & (os.path.isdir(os.path.join(rootdir, dir_i))):\n",
|
| 68 |
+
" dir_list += [dir_i]\n",
|
| 69 |
+
"dir_list"
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"cell_type": "code",
|
| 74 |
+
"execution_count": null,
|
| 75 |
+
"id": "ddcdc5cd-871e-4fd2-8457-18d3049fa76c",
|
| 76 |
+
"metadata": {
|
| 77 |
+
"tags": []
|
| 78 |
+
},
|
| 79 |
+
"outputs": [],
|
| 80 |
+
"source": [
|
| 81 |
+
"output_dir = \"results_EDefault_filtered\"\n",
|
| 82 |
+
"n_epochs = \"Default\" # scDeepsort default epochs = 300\n",
|
| 83 |
+
"\n",
|
| 84 |
+
"results_dict = dict()\n",
|
| 85 |
+
"for dir_name in tqdm(dir_list):\n",
|
| 86 |
+
" print(f\"TRAINING: {dir_name}\")\n",
|
| 87 |
+
" subrootdir = f\"{rootdir}{dir_name}/\"\n",
|
| 88 |
+
" train_files = [(f\"{subrootdir}{dir_name}_filtered_data_train.csv\",f\"{subrootdir}{dir_name}_filtered_celltype_train.csv\")]\n",
|
| 89 |
+
" test_file = f\"{subrootdir}{dir_name}_filtered_data_test.csv\"\n",
|
| 90 |
+
" label_file = f\"{subrootdir}{dir_name}_filtered_celltype_test.csv\"\n",
|
| 91 |
+
" \n",
|
| 92 |
+
" # define the model\n",
|
| 93 |
+
" model = deepsort.DeepSortClassifier(species='human',\n",
|
| 94 |
+
" tissue=dir_name,\n",
|
| 95 |
+
" gpu_id=0,\n",
|
| 96 |
+
" random_seed=1,\n",
|
| 97 |
+
" validation_fraction=0) # use all training data (already held out 20% in test data file)\n",
|
| 98 |
+
"\n",
|
| 99 |
+
" # fit the model\n",
|
| 100 |
+
" model.fit(train_files, save_path=f\"{subrootdir}{output_dir}\")\n",
|
| 101 |
+
" \n",
|
| 102 |
+
" # use the saved model to predict cell types in test data\n",
|
| 103 |
+
" model.predict(input_file=test_file,\n",
|
| 104 |
+
" model_path=f\"{subrootdir}{output_dir}\",\n",
|
| 105 |
+
" save_path=f\"{subrootdir}{output_dir}\",\n",
|
| 106 |
+
" unsure_rate=0,\n",
|
| 107 |
+
" file_type='csv')\n",
|
| 108 |
+
" labels_df = pd.read_csv(label_file)\n",
|
| 109 |
+
" preds_df = pd.read_csv(f\"{subrootdir}{output_dir}/human_{dir_name}_{dir_name}_filtered_data_test.csv\")\n",
|
| 110 |
+
" label_cell_ids = labels_df[\"Cell\"]\n",
|
| 111 |
+
" pred_cell_ids = preds_df[\"index\"]\n",
|
| 112 |
+
" assert list(label_cell_ids) == list(pred_cell_ids)\n",
|
| 113 |
+
" labels = list(labels_df[\"Cell_type\"])\n",
|
| 114 |
+
" if isinstance(preds_df[\"cell_subtype\"][0],float):\n",
|
| 115 |
+
" if np.isnan(preds_df[\"cell_subtype\"][0]):\n",
|
| 116 |
+
" preds = list(preds_df[\"cell_type\"])\n",
|
| 117 |
+
" results = compute_metrics(labels, preds)\n",
|
| 118 |
+
" else:\n",
|
| 119 |
+
" preds1 = list(preds_df[\"cell_type\"])\n",
|
| 120 |
+
" preds2 = list(preds_df[\"cell_subtype\"])\n",
|
| 121 |
+
" results1 = compute_metrics(labels, preds1)\n",
|
| 122 |
+
" results2 = compute_metrics(labels, preds2)\n",
|
| 123 |
+
" if results2[\"accuracy\"] > results1[\"accuracy\"]:\n",
|
| 124 |
+
" results = results2\n",
|
| 125 |
+
" else:\n",
|
| 126 |
+
" results = results1\n",
|
| 127 |
+
" \n",
|
| 128 |
+
" print(f\"{dir_name}: {results}\")\n",
|
| 129 |
+
" results_dict[dir_name] = results\n",
|
| 130 |
+
" with open(f\"{subrootdir}deepsort_E{n_epochs}_filtered_pred_{dir_name}.pickle\", \"wb\") as output_file:\n",
|
| 131 |
+
" pickle.dump(results, output_file)\n",
|
| 132 |
+
"\n",
|
| 133 |
+
"# save results\n",
|
| 134 |
+
"with open(f\"{rootdir}deepsort_E{n_epochs}_filtered_pred_dict.pickle\", \"wb\") as output_file:\n",
|
| 135 |
+
" pickle.dump(results_dict, output_file)\n",
|
| 136 |
+
" "
|
| 137 |
+
]
|
| 138 |
+
}
|
| 139 |
+
],
|
| 140 |
+
"metadata": {
|
| 141 |
+
"kernelspec": {
|
| 142 |
+
"display_name": "Python 3.8.6 64-bit ('3.8.6')",
|
| 143 |
+
"language": "python",
|
| 144 |
+
"name": "python3"
|
| 145 |
+
},
|
| 146 |
+
"language_info": {
|
| 147 |
+
"codemirror_mode": {
|
| 148 |
+
"name": "ipython",
|
| 149 |
+
"version": 3
|
| 150 |
+
},
|
| 151 |
+
"file_extension": ".py",
|
| 152 |
+
"mimetype": "text/x-python",
|
| 153 |
+
"name": "python",
|
| 154 |
+
"nbconvert_exporter": "python",
|
| 155 |
+
"pygments_lexer": "ipython3",
|
| 156 |
+
"version": "3.8.6"
|
| 157 |
+
},
|
| 158 |
+
"vscode": {
|
| 159 |
+
"interpreter": {
|
| 160 |
+
"hash": "eba1599a1f7e611c14c87ccff6793920aa63510b01fc0e229d6dd014149b8829"
|
| 161 |
+
}
|
| 162 |
+
}
|
| 163 |
+
},
|
| 164 |
+
"nbformat": 4,
|
| 165 |
+
"nbformat_minor": 5
|
| 166 |
+
}
|
config.json
CHANGED
|
@@ -2,23 +2,22 @@
|
|
| 2 |
"architectures": [
|
| 3 |
"BertForMaskedLM"
|
| 4 |
],
|
| 5 |
-
"attention_probs_dropout_prob": 0.
|
| 6 |
-
"
|
| 7 |
"hidden_act": "relu",
|
| 8 |
-
"hidden_dropout_prob": 0.
|
| 9 |
-
"hidden_size":
|
| 10 |
"initializer_range": 0.02,
|
| 11 |
-
"intermediate_size":
|
| 12 |
"layer_norm_eps": 1e-12,
|
| 13 |
-
"max_position_embeddings":
|
| 14 |
"model_type": "bert",
|
| 15 |
-
"num_attention_heads":
|
| 16 |
-
"num_hidden_layers":
|
| 17 |
"pad_token_id": 0,
|
| 18 |
"position_embedding_type": "absolute",
|
| 19 |
-
"
|
| 20 |
-
"transformers_version": "4.44.2",
|
| 21 |
"type_vocab_size": 2,
|
| 22 |
"use_cache": true,
|
| 23 |
-
"vocab_size":
|
| 24 |
}
|
|
|
|
| 2 |
"architectures": [
|
| 3 |
"BertForMaskedLM"
|
| 4 |
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.02,
|
| 6 |
+
"gradient_checkpointing": false,
|
| 7 |
"hidden_act": "relu",
|
| 8 |
+
"hidden_dropout_prob": 0.02,
|
| 9 |
+
"hidden_size": 256,
|
| 10 |
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 512,
|
| 12 |
"layer_norm_eps": 1e-12,
|
| 13 |
+
"max_position_embeddings": 2048,
|
| 14 |
"model_type": "bert",
|
| 15 |
+
"num_attention_heads": 4,
|
| 16 |
+
"num_hidden_layers": 6,
|
| 17 |
"pad_token_id": 0,
|
| 18 |
"position_embedding_type": "absolute",
|
| 19 |
+
"transformers_version": "4.6.0",
|
|
|
|
| 20 |
"type_vocab_size": 2,
|
| 21 |
"use_cache": true,
|
| 22 |
+
"vocab_size": 25426
|
| 23 |
}
|
docs/Makefile
DELETED
|
@@ -1,20 +0,0 @@
|
|
| 1 |
-
# Minimal makefile for Sphinx documentation
|
| 2 |
-
#
|
| 3 |
-
|
| 4 |
-
# You can set these variables from the command line, and also
|
| 5 |
-
# from the environment for the first two.
|
| 6 |
-
SPHINXOPTS ?=
|
| 7 |
-
SPHINXBUILD ?= sphinx-build
|
| 8 |
-
SOURCEDIR = source
|
| 9 |
-
BUILDDIR = build
|
| 10 |
-
|
| 11 |
-
# Put it first so that "make" without argument is like "make help".
|
| 12 |
-
help:
|
| 13 |
-
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
| 14 |
-
|
| 15 |
-
.PHONY: help Makefile
|
| 16 |
-
|
| 17 |
-
# Catch-all target: route all unknown targets to Sphinx using the new
|
| 18 |
-
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
| 19 |
-
%: Makefile
|
| 20 |
-
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/make.bat
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
@ECHO OFF
|
| 2 |
-
|
| 3 |
-
pushd %~dp0
|
| 4 |
-
|
| 5 |
-
REM Command file for Sphinx documentation
|
| 6 |
-
|
| 7 |
-
if "%SPHINXBUILD%" == "" (
|
| 8 |
-
set SPHINXBUILD=sphinx-build
|
| 9 |
-
)
|
| 10 |
-
set SOURCEDIR=source
|
| 11 |
-
set BUILDDIR=build
|
| 12 |
-
|
| 13 |
-
%SPHINXBUILD% >NUL 2>NUL
|
| 14 |
-
if errorlevel 9009 (
|
| 15 |
-
echo.
|
| 16 |
-
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
| 17 |
-
echo.installed, then set the SPHINXBUILD environment variable to point
|
| 18 |
-
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
| 19 |
-
echo.may add the Sphinx directory to PATH.
|
| 20 |
-
echo.
|
| 21 |
-
echo.If you don't have Sphinx installed, grab it from
|
| 22 |
-
echo.https://www.sphinx-doc.org/
|
| 23 |
-
exit /b 1
|
| 24 |
-
)
|
| 25 |
-
|
| 26 |
-
if "%1" == "" goto help
|
| 27 |
-
|
| 28 |
-
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
| 29 |
-
goto end
|
| 30 |
-
|
| 31 |
-
:help
|
| 32 |
-
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
| 33 |
-
|
| 34 |
-
:end
|
| 35 |
-
popd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/requirements.txt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
.
|
| 2 |
-
sphinx_rtd_theme==2.0.0
|
| 3 |
-
nbsphinx==0.9.3
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/_static/css/custom.css
DELETED
|
@@ -1,40 +0,0 @@
|
|
| 1 |
-
/* top left logo */
|
| 2 |
-
.wy-side-nav-search, .wy-nav-top {
|
| 3 |
-
background: linear-gradient(15deg, #13547a 0%, #80d0c7 100%);
|
| 4 |
-
}
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
/* unvisited link */
|
| 8 |
-
.wy-nav-content a:link {
|
| 9 |
-
color: #067abd;
|
| 10 |
-
}
|
| 11 |
-
|
| 12 |
-
/* visited link */
|
| 13 |
-
.wy-nav-content a:visited {
|
| 14 |
-
color: #4b827c;
|
| 15 |
-
}
|
| 16 |
-
|
| 17 |
-
/* mouse over link */
|
| 18 |
-
.wy-nav-content a:hover {
|
| 19 |
-
color: #80d0c7;
|
| 20 |
-
}
|
| 21 |
-
|
| 22 |
-
/* selected link */
|
| 23 |
-
.wy-nav-content a:active {
|
| 24 |
-
color: #4b827c;
|
| 25 |
-
}
|
| 26 |
-
|
| 27 |
-
/* class object */
|
| 28 |
-
.sig.sig-object {
|
| 29 |
-
padding: 5px 5px 5px 5px;
|
| 30 |
-
background-color: #ececec;
|
| 31 |
-
border-style: solid;
|
| 32 |
-
border-color: black;
|
| 33 |
-
border-width: 1px 0;
|
| 34 |
-
}
|
| 35 |
-
|
| 36 |
-
/* parameter object */
|
| 37 |
-
dt {
|
| 38 |
-
padding: 5px 5px 5px 5px;
|
| 39 |
-
background-color: #ececec;
|
| 40 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/_static/gf_logo.png
DELETED
|
Binary file (48.2 kB)
|
|
|
docs/source/about.rst
DELETED
|
@@ -1,49 +0,0 @@
|
|
| 1 |
-
About
|
| 2 |
-
=====
|
| 3 |
-
|
| 4 |
-
Model Description
|
| 5 |
-
-----------------
|
| 6 |
-
|
| 7 |
-
**Geneformer** is a context-aware, attention-based deep learning model pretrained on a large-scale corpus of single-cell transcriptomes to enable context-specific predictions in settings with limited data in network biology. During pretraining, Geneformer gained a fundamental understanding of network dynamics, encoding network hierarchy in the attention weights of the model in a completely self-supervised manner. With both zero-shot learning and fine-tuning with limited task-specific data, Geneformer consistently boosted predictive accuracy in a diverse panel of downstream tasks relevant to chromatin and network dynamics. In silico perturbation with zero-shot learning identified a novel transcription factor in cardiomyocytes that we experimentally validated to be critical to their ability to generate contractile force. In silico treatment with limited patient data revealed candidate therapeutic targets for cardiomyopathy that we experimentally validated to significantly improve the ability of cardiomyocytes to generate contractile force in an iPSC model of the disease. Overall, Geneformer represents a foundational deep learning model pretrained on a large-scale corpus of human single-cell transcriptomes to gain a fundamental understanding of gene network dynamics that can now be democratized to a vast array of downstream tasks to accelerate discovery of key network regulators and candidate therapeutic targets.
|
| 8 |
-
|
| 9 |
-
In `our manuscript <https://rdcu.be/ddrx0>`_, we report results for the original 6-layer Geneformer model pretrained on Genecorpus-30M. We additionally provide within the repository a 12-layer Geneformer model, scaled up while retaining the width:depth aspect ratio, also pretrained on Genecorpus-30M.
|
| 10 |
-
|
| 11 |
-
Both the `6 <https://huggingface.co/ctheodoris/Geneformer/blob/main/gf-6L-30M-i2048/model.safetensors>`_ and `12 <https://huggingface.co/ctheodoris/Geneformer/blob/main/gf-12L-30M-i2048/pytorch_model.bin>`_ layer Geneformer models were pretrained in June 2021.
|
| 12 |
-
|
| 13 |
-
Also see `our 2024 manuscript <https://www.biorxiv.org/content/10.1101/2024.08.16.608180v1.full.pdf>`_ for details of the `expanded model <https://huggingface.co/ctheodoris/Geneformer/blob/main/model.safetensors>`_ trained on ~95 million transcriptomes in April 2024 and of our continual learning, multitask learning, and quantization strategies.
|
| 14 |
-
|
| 15 |
-
Application
|
| 16 |
-
-----------
|
| 17 |
-
|
| 18 |
-
The pretrained Geneformer model can be used directly for zero-shot learning, for example for in silico perturbation analysis, or fine-tuned towards the relevant downstream task, such as gene or cell state classification.
|
| 19 |
-
|
| 20 |
-
Example applications demonstrated in `our manuscript <https://rdcu.be/ddrx0>`_ include:
|
| 21 |
-
|
| 22 |
-
| *Fine-tuning*:
|
| 23 |
-
| - transcription factor dosage sensitivity
|
| 24 |
-
| - chromatin dynamics (bivalently marked promoters)
|
| 25 |
-
| - transcription factor regulatory range
|
| 26 |
-
| - gene network centrality
|
| 27 |
-
| - transcription factor targets
|
| 28 |
-
| - cell type annotation
|
| 29 |
-
| - batch integration
|
| 30 |
-
| - cell state classification across differentiation
|
| 31 |
-
| - disease classification
|
| 32 |
-
| - in silico perturbation to determine disease-driving genes
|
| 33 |
-
| - in silico treatment to determine candidate therapeutic targets
|
| 34 |
-
|
| 35 |
-
| *Zero-shot learning*:
|
| 36 |
-
| - batch integration
|
| 37 |
-
| - gene context specificity
|
| 38 |
-
| - in silico reprogramming
|
| 39 |
-
| - in silico differentiation
|
| 40 |
-
| - in silico perturbation to determine impact on cell state
|
| 41 |
-
| - in silico perturbation to determine transcription factor targets
|
| 42 |
-
| - in silico perturbation to determine transcription factor cooperativity
|
| 43 |
-
|
| 44 |
-
Citations
|
| 45 |
-
---------
|
| 46 |
-
|
| 47 |
-
| C V Theodoris #, L Xiao, A Chopra, M D Chaffin, Z R Al Sayed, M C Hill, H Mantineo, E Brydon, Z Zeng, X S Liu, P T Ellinor #. `Transfer learning enables predictions in network biology. <https://rdcu.be/ddrx0>`_ *Nature*, 31 May 2023. (# co-corresponding authors)
|
| 48 |
-
|
| 49 |
-
| H Chen \*, M S Venkatesh \*, J Gomez Ortega, S V Mahesh, T Nandi, R Madduri, K Pelka †, C V Theodoris † #. `Quantized multi-task learning for context-specific representations of gene network dynamics. <https://www.biorxiv.org/content/10.1101/2024.08.16.608180v1.full.pdf>`_ *bioRxiv*, 19 Aug 2024. (\* co-first authors, † co-senior authors, # corresponding author)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/api.rst
DELETED
|
@@ -1,51 +0,0 @@
|
|
| 1 |
-
API
|
| 2 |
-
===
|
| 3 |
-
|
| 4 |
-
Tokenizer
|
| 5 |
-
---------
|
| 6 |
-
|
| 7 |
-
.. toctree::
|
| 8 |
-
:maxdepth: 1
|
| 9 |
-
|
| 10 |
-
geneformer.tokenizer
|
| 11 |
-
|
| 12 |
-
Classifier
|
| 13 |
-
----------
|
| 14 |
-
|
| 15 |
-
.. toctree::
|
| 16 |
-
:maxdepth: 1
|
| 17 |
-
|
| 18 |
-
geneformer.classifier
|
| 19 |
-
|
| 20 |
-
Multitask Classifier
|
| 21 |
-
--------------------
|
| 22 |
-
|
| 23 |
-
.. toctree::
|
| 24 |
-
:maxdepth: 1
|
| 25 |
-
|
| 26 |
-
geneformer.mtl_classifier
|
| 27 |
-
|
| 28 |
-
Embedding Extractor
|
| 29 |
-
-------------------
|
| 30 |
-
|
| 31 |
-
.. toctree::
|
| 32 |
-
:maxdepth: 1
|
| 33 |
-
|
| 34 |
-
geneformer.emb_extractor
|
| 35 |
-
|
| 36 |
-
In Silico Perturber
|
| 37 |
-
-------------------
|
| 38 |
-
|
| 39 |
-
.. toctree::
|
| 40 |
-
:maxdepth: 1
|
| 41 |
-
|
| 42 |
-
geneformer.in_silico_perturber
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
In Silico Perturber Stats
|
| 46 |
-
-------------------------
|
| 47 |
-
|
| 48 |
-
.. toctree::
|
| 49 |
-
:maxdepth: 1
|
| 50 |
-
|
| 51 |
-
geneformer.in_silico_perturber_stats
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/conf.py
DELETED
|
@@ -1,80 +0,0 @@
|
|
| 1 |
-
# Configuration file for the Sphinx documentation builder.
|
| 2 |
-
#
|
| 3 |
-
# For the full list of built-in configuration values, see the documentation:
|
| 4 |
-
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
| 5 |
-
|
| 6 |
-
import pathlib
|
| 7 |
-
import re
|
| 8 |
-
import sys
|
| 9 |
-
|
| 10 |
-
from sphinx.ext import autodoc
|
| 11 |
-
|
| 12 |
-
sys.path.insert(0, pathlib.Path(__file__).parents[2].resolve().as_posix())
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
# -- Project information -----------------------------------------------------
|
| 16 |
-
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
| 17 |
-
|
| 18 |
-
project = "geneformer"
|
| 19 |
-
copyright = "2024, Christina Theodoris"
|
| 20 |
-
author = "Christina Theodoris"
|
| 21 |
-
release = "0.1.0"
|
| 22 |
-
repository_url = "https://huggingface.co/ctheodoris/Geneformer"
|
| 23 |
-
|
| 24 |
-
# -- General configuration ---------------------------------------------------
|
| 25 |
-
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
| 26 |
-
|
| 27 |
-
extensions = [
|
| 28 |
-
"sphinx.ext.autodoc",
|
| 29 |
-
"sphinx.ext.autosummary",
|
| 30 |
-
"nbsphinx",
|
| 31 |
-
"sphinx.ext.viewcode",
|
| 32 |
-
"sphinx.ext.doctest",
|
| 33 |
-
]
|
| 34 |
-
|
| 35 |
-
templates_path = ["_templates"]
|
| 36 |
-
exclude_patterns = [
|
| 37 |
-
"**.ipynb_checkpoints",
|
| 38 |
-
]
|
| 39 |
-
autoclass_content = "both"
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
class MockedClassDocumenter(autodoc.ClassDocumenter):
|
| 43 |
-
def add_line(self, line: str, source: str, *lineno: int) -> None:
|
| 44 |
-
if line == " Bases: :py:class:`object`":
|
| 45 |
-
return
|
| 46 |
-
super().add_line(line, source, *lineno)
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
autodoc.ClassDocumenter = MockedClassDocumenter
|
| 50 |
-
add_module_names = False
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
def process_signature(app, what, name, obj, options, signature, return_annotation):
|
| 54 |
-
# loop through each line in the docstring and replace path with
|
| 55 |
-
# the generic path text
|
| 56 |
-
signature = re.sub(r"PosixPath\(.*?\)", "FILEPATH", signature)
|
| 57 |
-
return (signature, None)
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
def setup(app):
|
| 61 |
-
app.connect("autodoc-process-signature", process_signature)
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
# -- Options for HTML output -------------------------------------------------
|
| 65 |
-
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
| 66 |
-
|
| 67 |
-
html_theme = "sphinx_rtd_theme"
|
| 68 |
-
html_show_sphinx = False
|
| 69 |
-
html_static_path = ["_static"]
|
| 70 |
-
html_logo = "_static/gf_logo.png"
|
| 71 |
-
html_theme_options = {
|
| 72 |
-
"collapse_navigation": False,
|
| 73 |
-
"sticky_navigation": True,
|
| 74 |
-
"navigation_depth": 3,
|
| 75 |
-
"logo_only": True,
|
| 76 |
-
}
|
| 77 |
-
html_css_files = [
|
| 78 |
-
"css/custom.css",
|
| 79 |
-
]
|
| 80 |
-
html_show_sourcelink = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/geneformer.classifier.rst
DELETED
|
@@ -1,10 +0,0 @@
|
|
| 1 |
-
geneformer.classifier
|
| 2 |
-
=====================
|
| 3 |
-
|
| 4 |
-
.. automodule:: geneformer.classifier
|
| 5 |
-
:members:
|
| 6 |
-
:undoc-members:
|
| 7 |
-
:show-inheritance:
|
| 8 |
-
:exclude-members:
|
| 9 |
-
valid_option_dict,
|
| 10 |
-
validate_options
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/geneformer.emb_extractor.rst
DELETED
|
@@ -1,26 +0,0 @@
|
|
| 1 |
-
geneformer.emb\_extractor
|
| 2 |
-
=========================
|
| 3 |
-
|
| 4 |
-
.. automodule:: geneformer.emb_extractor
|
| 5 |
-
:members:
|
| 6 |
-
:undoc-members:
|
| 7 |
-
:show-inheritance:
|
| 8 |
-
:exclude-members:
|
| 9 |
-
accumulate_tdigests,
|
| 10 |
-
gen_heatmap_class_colors,
|
| 11 |
-
gen_heatmap_class_dict,
|
| 12 |
-
get_embs,
|
| 13 |
-
label_cell_embs,
|
| 14 |
-
label_gene_embs,
|
| 15 |
-
make_colorbar,
|
| 16 |
-
plot_heatmap,
|
| 17 |
-
plot_umap,
|
| 18 |
-
summarize_gene_embs,
|
| 19 |
-
tdigest_mean,
|
| 20 |
-
tdigest_median,
|
| 21 |
-
test_emb,
|
| 22 |
-
update_tdigest_dict,
|
| 23 |
-
update_tdigest_dict_mean,
|
| 24 |
-
update_tdigest_dict_median,
|
| 25 |
-
valid_option_dict,
|
| 26 |
-
validate_options
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/geneformer.in_silico_perturber.rst
DELETED
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
geneformer.in\_silico\_perturber
|
| 2 |
-
=======================================
|
| 3 |
-
|
| 4 |
-
.. automodule:: geneformer.in_silico_perturber
|
| 5 |
-
:members:
|
| 6 |
-
:undoc-members:
|
| 7 |
-
:show-inheritance:
|
| 8 |
-
:exclude-members: valid_option_dict, validate_options, apply_additional_filters, isp_perturb_all, isp_perturb_set, , isp_perturb_all_special, isp_perturb_set_special, update_perturbation_dictionary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/geneformer.in_silico_perturber_stats.rst
DELETED
|
@@ -1,25 +0,0 @@
|
|
| 1 |
-
geneformer.in\_silico\_perturber\_stats
|
| 2 |
-
==============================================
|
| 3 |
-
|
| 4 |
-
.. automodule:: geneformer.in_silico_perturber_stats
|
| 5 |
-
:members:
|
| 6 |
-
:undoc-members:
|
| 7 |
-
:show-inheritance:
|
| 8 |
-
:exclude-members:
|
| 9 |
-
find,
|
| 10 |
-
get_fdr,
|
| 11 |
-
get_gene_list,
|
| 12 |
-
get_impact_component,
|
| 13 |
-
invert_dict,
|
| 14 |
-
isp_aggregate_gene_shifts,
|
| 15 |
-
isp_aggregate_grouped_perturb,
|
| 16 |
-
isp_stats_mixture_model,
|
| 17 |
-
isp_stats_to_goal_state,
|
| 18 |
-
isp_stats_vs_null,
|
| 19 |
-
n_detections,
|
| 20 |
-
read_dict,
|
| 21 |
-
read_dictionaries,
|
| 22 |
-
token_to_gene_name,
|
| 23 |
-
token_tuple_to_ensembl_ids,
|
| 24 |
-
valid_option_dict,
|
| 25 |
-
validate_options
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/geneformer.mtl_classifier.rst
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
geneformer.mtl\_classifier
|
| 2 |
-
==========================
|
| 3 |
-
|
| 4 |
-
.. automodule:: geneformer.mtl_classifier
|
| 5 |
-
:members:
|
| 6 |
-
:undoc-members:
|
| 7 |
-
:show-inheritance:
|
| 8 |
-
:exclude-members:
|
| 9 |
-
valid_option_dict,
|
| 10 |
-
validate_options,
|
| 11 |
-
validate_additional_options
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/geneformer.tokenizer.rst
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
geneformer.tokenizer
|
| 2 |
-
====================
|
| 3 |
-
|
| 4 |
-
.. automodule:: geneformer.tokenizer
|
| 5 |
-
:members:
|
| 6 |
-
:undoc-members:
|
| 7 |
-
:show-inheritance:
|
| 8 |
-
:exclude-members:
|
| 9 |
-
create_dataset,
|
| 10 |
-
tokenize_anndata,
|
| 11 |
-
tokenize_files,
|
| 12 |
-
tokenize_loom,
|
| 13 |
-
rank_genes,
|
| 14 |
-
tokenize_cell,
|
| 15 |
-
sum_ensembl_ids
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/getstarted.rst
DELETED
|
@@ -1,36 +0,0 @@
|
|
| 1 |
-
Getting Started
|
| 2 |
-
===============
|
| 3 |
-
|
| 4 |
-
Installation
|
| 5 |
-
------------
|
| 6 |
-
|
| 7 |
-
Geneformer installation instructions.
|
| 8 |
-
|
| 9 |
-
Make sure you have git-lfs installed (https://git-lfs.com).
|
| 10 |
-
|
| 11 |
-
.. code-block:: bash
|
| 12 |
-
|
| 13 |
-
git lfs install
|
| 14 |
-
git clone https://huggingface.co/ctheodoris/Geneformer
|
| 15 |
-
cd Geneformer
|
| 16 |
-
pip install .
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
Tutorials
|
| 20 |
-
---------
|
| 21 |
-
|
| 22 |
-
| See `examples <https://huggingface.co/ctheodoris/Geneformer/tree/main/examples>`_ for:
|
| 23 |
-
| - tokenizing transcriptomes
|
| 24 |
-
| - pretraining
|
| 25 |
-
| - hyperparameter tuning
|
| 26 |
-
| - fine-tuning
|
| 27 |
-
| - extracting and plotting cell embeddings
|
| 28 |
-
| - in silico perturbation
|
| 29 |
-
|
| 30 |
-
Please note that the fine-tuning examples are meant to be generally applicable and the input datasets and labels will vary dependent on the downstream task. Example input files for a few of the downstream tasks demonstrated in the manuscript are located within the `example_input_files directory <https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files>`_ in the dataset repository, but these only represent a few example fine-tuning applications.
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
Tips
|
| 34 |
-
----
|
| 35 |
-
|
| 36 |
-
Please note that GPU resources are required for efficient usage of Geneformer. Additionally, we strongly recommend tuning hyperparameters for each downstream fine-tuning application as this can significantly boost predictive potential in the downstream task (e.g. max learning rate, learning schedule, number of layers to freeze, etc.).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/source/index.rst
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
Geneformer
|
| 2 |
-
==========
|
| 3 |
-
|
| 4 |
-
Geneformer is a foundation transformer model pretrained on a large-scale corpus of single cell transcriptomes to enable context-aware predictions in network biology.
|
| 5 |
-
|
| 6 |
-
See `our manuscript <https://rdcu.be/ddrx0>`_ for details.
|
| 7 |
-
|
| 8 |
-
Table of Contents
|
| 9 |
-
-----------------
|
| 10 |
-
|
| 11 |
-
.. toctree::
|
| 12 |
-
:maxdepth: 2
|
| 13 |
-
|
| 14 |
-
about
|
| 15 |
-
getstarted
|
| 16 |
-
api
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
examples/cell_classification.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
examples/distributed_multitask_cell_classification.ipynb
DELETED
|
@@ -1,149 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"cells": [
|
| 3 |
-
{
|
| 4 |
-
"cell_type": "code",
|
| 5 |
-
"execution_count": null,
|
| 6 |
-
"id": "b3266a7b",
|
| 7 |
-
"metadata": {},
|
| 8 |
-
"outputs": [],
|
| 9 |
-
"source": [
|
| 10 |
-
"import os\n",
|
| 11 |
-
"import torch\n",
|
| 12 |
-
"from geneformer import MTLClassifier"
|
| 13 |
-
]
|
| 14 |
-
},
|
| 15 |
-
{
|
| 16 |
-
"cell_type": "code",
|
| 17 |
-
"execution_count": null,
|
| 18 |
-
"id": "3e12ac9f",
|
| 19 |
-
"metadata": {},
|
| 20 |
-
"outputs": [],
|
| 21 |
-
"source": [
|
| 22 |
-
"# Define paths\n",
|
| 23 |
-
"pretrained_path = \"/path/to/pretrained/Geneformer/model\" \n",
|
| 24 |
-
"# input data is tokenized rank value encodings generated by Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)\n",
|
| 25 |
-
"train_path = \"/path/to/train/data.dataset\"\n",
|
| 26 |
-
"val_path = \"/path/to/val/data.dataset\"\n",
|
| 27 |
-
"test_path = \"/path/to/test/data.dataset\"\n",
|
| 28 |
-
"results_dir = \"/path/to/results/directory\"\n",
|
| 29 |
-
"model_save_path = \"/path/to/model/save/path\"\n",
|
| 30 |
-
"tensorboard_log_dir = \"/path/to/tensorboard/log/dir\"\n",
|
| 31 |
-
"\n",
|
| 32 |
-
"# Define tasks and hyperparameters\n",
|
| 33 |
-
"# task_columns should be a list of column names from your dataset\n",
|
| 34 |
-
"# Each column represents a specific classification task (e.g. cell type, disease state)\n",
|
| 35 |
-
"task_columns = [\"cell_type\", \"disease_state\"] # Example task columns"
|
| 36 |
-
]
|
| 37 |
-
},
|
| 38 |
-
{
|
| 39 |
-
"cell_type": "code",
|
| 40 |
-
"execution_count": null,
|
| 41 |
-
"id": "c9bd7562",
|
| 42 |
-
"metadata": {},
|
| 43 |
-
"outputs": [],
|
| 44 |
-
"source": [
|
| 45 |
-
"# Check GPU environment\n",
|
| 46 |
-
"num_gpus = torch.cuda.device_count()\n",
|
| 47 |
-
"use_distributed = num_gpus > 1\n",
|
| 48 |
-
"print(f\"Number of GPUs detected: {num_gpus}\")\n",
|
| 49 |
-
"print(f\"Using distributed training: {use_distributed}\")\n",
|
| 50 |
-
"\n",
|
| 51 |
-
"# Set environment variables for distributed training when multiple GPUs are available\n",
|
| 52 |
-
"if use_distributed:\n",
|
| 53 |
-
" os.environ[\"MASTER_ADDR\"] = \"localhost\" # hostname\n",
|
| 54 |
-
" os.environ[\"MASTER_PORT\"] = \"12355\" # Choose an available port\n",
|
| 55 |
-
" print(\"Distributed environment variables set.\")"
|
| 56 |
-
]
|
| 57 |
-
},
|
| 58 |
-
{
|
| 59 |
-
"cell_type": "code",
|
| 60 |
-
"execution_count": null,
|
| 61 |
-
"id": "b6ff3618",
|
| 62 |
-
"metadata": {},
|
| 63 |
-
"outputs": [],
|
| 64 |
-
"source": [
|
| 65 |
-
"#Define Hyperparameters for Optimization\n",
|
| 66 |
-
"hyperparameters = {\n",
|
| 67 |
-
" \"learning_rate\": {\"type\": \"float\", \"low\": 1e-5, \"high\": 1e-3, \"log\": True},\n",
|
| 68 |
-
" \"warmup_ratio\": {\"type\": \"float\", \"low\": 0.005, \"high\": 0.01},\n",
|
| 69 |
-
" \"weight_decay\": {\"type\": \"float\", \"low\": 0.01, \"high\": 0.1},\n",
|
| 70 |
-
" \"dropout_rate\": {\"type\": \"float\", \"low\": 0.0, \"high\": 0.7},\n",
|
| 71 |
-
" \"lr_scheduler_type\": {\"type\": \"categorical\", \"choices\": [\"cosine\"]},\n",
|
| 72 |
-
" \"task_weights\": {\"type\": \"float\", \"low\": 0.1, \"high\": 2.0},\n",
|
| 73 |
-
"}"
|
| 74 |
-
]
|
| 75 |
-
},
|
| 76 |
-
{
|
| 77 |
-
"cell_type": "code",
|
| 78 |
-
"execution_count": null,
|
| 79 |
-
"id": "f665c5a7",
|
| 80 |
-
"metadata": {},
|
| 81 |
-
"outputs": [],
|
| 82 |
-
"source": [
|
| 83 |
-
"mc = MTLClassifier(\n",
|
| 84 |
-
" task_columns=task_columns, # Our defined classification tasks\n",
|
| 85 |
-
" study_name=\"MTLClassifier_distributed\",\n",
|
| 86 |
-
" pretrained_path=pretrained_path,\n",
|
| 87 |
-
" train_path=train_path,\n",
|
| 88 |
-
" val_path=val_path,\n",
|
| 89 |
-
" test_path=test_path,\n",
|
| 90 |
-
" model_save_path=model_save_path,\n",
|
| 91 |
-
" results_dir=results_dir,\n",
|
| 92 |
-
" tensorboard_log_dir=tensorboard_log_dir,\n",
|
| 93 |
-
" hyperparameters=hyperparameters,\n",
|
| 94 |
-
" # Distributed training parameters\n",
|
| 95 |
-
" distributed_training=use_distributed, # Enable distributed training if multiple GPUs available\n",
|
| 96 |
-
" master_addr=\"localhost\" if use_distributed else None,\n",
|
| 97 |
-
" master_port=\"12355\" if use_distributed else None,\n",
|
| 98 |
-
" # Other training parameters\n",
|
| 99 |
-
" n_trials=15, # Number of trials for hyperparameter optimization\n",
|
| 100 |
-
" epochs=1, # Number of training epochs (1 suggested to prevent overfitting)\n",
|
| 101 |
-
" batch_size=8, # Adjust based on available GPU memory\n",
|
| 102 |
-
" gradient_accumulation_steps=4, # Accumulate gradients over multiple steps\n",
|
| 103 |
-
" gradient_clipping=True, # Enable gradient clipping for stability\n",
|
| 104 |
-
" max_grad_norm=1.0, # Set maximum gradient norm\n",
|
| 105 |
-
" seed=42\n",
|
| 106 |
-
")"
|
| 107 |
-
]
|
| 108 |
-
},
|
| 109 |
-
{
|
| 110 |
-
"cell_type": "code",
|
| 111 |
-
"execution_count": null,
|
| 112 |
-
"id": "f69f7b6a",
|
| 113 |
-
"metadata": {},
|
| 114 |
-
"outputs": [],
|
| 115 |
-
"source": [
|
| 116 |
-
"# Run Hyperparameter Optimization with Distributed Training\n",
|
| 117 |
-
"if __name__ == \"__main__\":\n",
|
| 118 |
-
" # This guard is required for distributed training to prevent\n",
|
| 119 |
-
" # infinite subprocess spawning when using torch.multiprocessing\n",
|
| 120 |
-
" mc.run_optuna_study()"
|
| 121 |
-
]
|
| 122 |
-
},
|
| 123 |
-
{
|
| 124 |
-
"cell_type": "code",
|
| 125 |
-
"execution_count": null,
|
| 126 |
-
"id": "3affd5dd",
|
| 127 |
-
"metadata": {},
|
| 128 |
-
"outputs": [],
|
| 129 |
-
"source": [
|
| 130 |
-
"# Evaluate the Model on Test Data\n",
|
| 131 |
-
"if __name__ == \"__main__\":\n",
|
| 132 |
-
" mc.load_and_evaluate_test_model()"
|
| 133 |
-
]
|
| 134 |
-
}
|
| 135 |
-
],
|
| 136 |
-
"metadata": {
|
| 137 |
-
"kernelspec": {
|
| 138 |
-
"display_name": "bio",
|
| 139 |
-
"language": "python",
|
| 140 |
-
"name": "python3"
|
| 141 |
-
},
|
| 142 |
-
"language_info": {
|
| 143 |
-
"name": "python",
|
| 144 |
-
"version": "3.12.8"
|
| 145 |
-
}
|
| 146 |
-
},
|
| 147 |
-
"nbformat": 4,
|
| 148 |
-
"nbformat_minor": 5
|
| 149 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
examples/example_input_files/bivalent_promoters/bivalent_gene_labels.txt
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ENSG00000005073
|
| 2 |
+
ENSG00000007372
|
| 3 |
+
ENSG00000007372
|
| 4 |
+
ENSG00000043355
|
| 5 |
+
ENSG00000068120
|
| 6 |
+
ENSG00000075891
|
| 7 |
+
ENSG00000078399
|
| 8 |
+
ENSG00000105991
|
| 9 |
+
ENSG00000105996
|
| 10 |
+
ENSG00000105997
|
| 11 |
+
ENSG00000106004
|
| 12 |
+
ENSG00000106006
|
| 13 |
+
ENSG00000106031
|
| 14 |
+
ENSG00000106038
|
| 15 |
+
ENSG00000107807
|
| 16 |
+
ENSG00000107821
|
| 17 |
+
ENSG00000107831
|
| 18 |
+
ENSG00000107859
|
| 19 |
+
ENSG00000107862
|
| 20 |
+
ENSG00000108511
|
| 21 |
+
ENSG00000108786
|
| 22 |
+
ENSG00000108797
|
| 23 |
+
ENSG00000110693
|
| 24 |
+
ENSG00000110693
|
| 25 |
+
ENSG00000113430
|
| 26 |
+
ENSG00000115844
|
| 27 |
+
ENSG00000117707
|
| 28 |
+
ENSG00000117707
|
| 29 |
+
ENSG00000119915
|
| 30 |
+
ENSG00000120068
|
| 31 |
+
ENSG00000120075
|
| 32 |
+
ENSG00000120093
|
| 33 |
+
ENSG00000120093
|
| 34 |
+
ENSG00000120094
|
| 35 |
+
ENSG00000122592
|
| 36 |
+
ENSG00000125285
|
| 37 |
+
ENSG00000125798
|
| 38 |
+
ENSG00000125813
|
| 39 |
+
ENSG00000125813
|
| 40 |
+
ENSG00000125816
|
| 41 |
+
ENSG00000125820
|
| 42 |
+
ENSG00000128573
|
| 43 |
+
ENSG00000128645
|
| 44 |
+
ENSG00000128652
|
| 45 |
+
ENSG00000128709
|
| 46 |
+
ENSG00000128710
|
| 47 |
+
ENSG00000128713
|
| 48 |
+
ENSG00000128714
|
| 49 |
+
ENSG00000129514
|
| 50 |
+
ENSG00000131196
|
| 51 |
+
ENSG00000131196
|
| 52 |
+
ENSG00000136327
|
| 53 |
+
ENSG00000136944
|
| 54 |
+
ENSG00000138083
|
| 55 |
+
ENSG00000139800
|
| 56 |
+
ENSG00000143013
|
| 57 |
+
ENSG00000143632
|
| 58 |
+
ENSG00000144355
|
| 59 |
+
ENSG00000148680
|
| 60 |
+
ENSG00000148826
|
| 61 |
+
ENSG00000151615
|
| 62 |
+
ENSG00000152192
|
| 63 |
+
ENSG00000152977
|
| 64 |
+
ENSG00000159184
|
| 65 |
+
ENSG00000159387
|
| 66 |
+
ENSG00000163412
|
| 67 |
+
ENSG00000163421
|
| 68 |
+
ENSG00000163623
|
| 69 |
+
ENSG00000164330
|
| 70 |
+
ENSG00000164438
|
| 71 |
+
ENSG00000164690
|
| 72 |
+
ENSG00000164778
|
| 73 |
+
ENSG00000165588
|
| 74 |
+
ENSG00000165588
|
| 75 |
+
ENSG00000165588
|
| 76 |
+
ENSG00000166407
|
| 77 |
+
ENSG00000166407
|
| 78 |
+
ENSG00000168505
|
| 79 |
+
ENSG00000168875
|
| 80 |
+
ENSG00000169946
|
| 81 |
+
ENSG00000170166
|
| 82 |
+
ENSG00000170178
|
| 83 |
+
ENSG00000170549
|
| 84 |
+
ENSG00000170561
|
| 85 |
+
ENSG00000170577
|
| 86 |
+
ENSG00000170689
|
| 87 |
+
ENSG00000173917
|
| 88 |
+
ENSG00000174279
|
| 89 |
+
ENSG00000174963
|
| 90 |
+
ENSG00000174963
|
| 91 |
+
ENSG00000175879
|
| 92 |
+
ENSG00000176842
|
| 93 |
+
ENSG00000177508
|
| 94 |
+
ENSG00000178573
|
| 95 |
+
ENSG00000182568
|
| 96 |
+
ENSG00000182742
|
| 97 |
+
ENSG00000185551
|
| 98 |
+
ENSG00000185551
|
| 99 |
+
ENSG00000187140
|
| 100 |
+
ENSG00000196092
|
| 101 |
+
ENSG00000197576
|
| 102 |
+
ENSG00000198807
|
| 103 |
+
ENSG00000253293
|
| 104 |
+
ENSG00000256463
|
| 105 |
+
ENSG00000260027
|
| 106 |
+
ENSG00000276644
|
| 107 |
+
ENSG00000285708
|
examples/example_input_files/bivalent_promoters/lys4_only_gene_labels.txt
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ENSG00000012048
|
| 2 |
+
ENSG00000033627
|
| 3 |
+
ENSG00000037042
|
| 4 |
+
ENSG00000055950
|
| 5 |
+
ENSG00000067596
|
| 6 |
+
ENSG00000069248
|
| 7 |
+
ENSG00000072682
|
| 8 |
+
ENSG00000085274
|
| 9 |
+
ENSG00000088035
|
| 10 |
+
ENSG00000088930
|
| 11 |
+
ENSG00000095539
|
| 12 |
+
ENSG00000102471
|
| 13 |
+
ENSG00000102967
|
| 14 |
+
ENSG00000104313
|
| 15 |
+
ENSG00000105146
|
| 16 |
+
ENSG00000105379
|
| 17 |
+
ENSG00000105982
|
| 18 |
+
ENSG00000105983
|
| 19 |
+
ENSG00000107816
|
| 20 |
+
ENSG00000107819
|
| 21 |
+
ENSG00000107829
|
| 22 |
+
ENSG00000107833
|
| 23 |
+
ENSG00000108784
|
| 24 |
+
ENSG00000108799
|
| 25 |
+
ENSG00000108828
|
| 26 |
+
ENSG00000108830
|
| 27 |
+
ENSG00000109911
|
| 28 |
+
ENSG00000113522
|
| 29 |
+
ENSG00000119487
|
| 30 |
+
ENSG00000120049
|
| 31 |
+
ENSG00000125347
|
| 32 |
+
ENSG00000126581
|
| 33 |
+
ENSG00000131374
|
| 34 |
+
ENSG00000131437
|
| 35 |
+
ENSG00000131462
|
| 36 |
+
ENSG00000131467
|
| 37 |
+
ENSG00000131469
|
| 38 |
+
ENSG00000131470
|
| 39 |
+
ENSG00000131475
|
| 40 |
+
ENSG00000131477
|
| 41 |
+
ENSG00000135272
|
| 42 |
+
ENSG00000135776
|
| 43 |
+
ENSG00000135801
|
| 44 |
+
ENSG00000136158
|
| 45 |
+
ENSG00000140262
|
| 46 |
+
ENSG00000140450
|
| 47 |
+
ENSG00000140563
|
| 48 |
+
ENSG00000140829
|
| 49 |
+
ENSG00000140830
|
| 50 |
+
ENSG00000145494
|
| 51 |
+
ENSG00000146909
|
| 52 |
+
ENSG00000147905
|
| 53 |
+
ENSG00000148688
|
| 54 |
+
ENSG00000148840
|
| 55 |
+
ENSG00000148950
|
| 56 |
+
ENSG00000151332
|
| 57 |
+
ENSG00000151338
|
| 58 |
+
ENSG00000165637
|
| 59 |
+
ENSG00000165644
|
| 60 |
+
ENSG00000166135
|
| 61 |
+
ENSG00000166136
|
| 62 |
+
ENSG00000166167
|
| 63 |
+
ENSG00000166169
|
| 64 |
+
ENSG00000166189
|
| 65 |
+
ENSG00000166197
|
| 66 |
+
ENSG00000166377
|
| 67 |
+
ENSG00000167081
|
| 68 |
+
ENSG00000168118
|
| 69 |
+
ENSG00000171421
|
| 70 |
+
ENSG00000175832
|
| 71 |
+
ENSG00000186480
|
| 72 |
+
ENSG00000187098
|
| 73 |
+
ENSG00000188554
|
| 74 |
+
ENSG00000196628
|
| 75 |
+
ENSG00000196628
|
| 76 |
+
ENSG00000198728
|
| 77 |
+
ENSG00000198728
|
| 78 |
+
ENSG00000198863
|
| 79 |
+
ENSG00000285283
|
| 80 |
+
ENSG00000285708
|
examples/example_input_files/bivalent_promoters/no_methylation_gene_labels.txt
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ENSG00000068079
|
| 2 |
+
ENSG00000068383
|
| 3 |
+
ENSG00000075290
|
| 4 |
+
ENSG00000104313
|
| 5 |
+
ENSG00000105370
|
| 6 |
+
ENSG00000105374
|
| 7 |
+
ENSG00000105383
|
| 8 |
+
ENSG00000106536
|
| 9 |
+
ENSG00000113520
|
| 10 |
+
ENSG00000113525
|
| 11 |
+
ENSG00000118557
|
| 12 |
+
ENSG00000125257
|
| 13 |
+
ENSG00000128573
|
| 14 |
+
ENSG00000131471
|
| 15 |
+
ENSG00000131480
|
| 16 |
+
ENSG00000131482
|
| 17 |
+
ENSG00000134532
|
| 18 |
+
ENSG00000136319
|
| 19 |
+
ENSG00000138792
|
| 20 |
+
ENSG00000140262
|
| 21 |
+
ENSG00000140718
|
| 22 |
+
ENSG00000147488
|
| 23 |
+
ENSG00000147488
|
| 24 |
+
ENSG00000148677
|
| 25 |
+
ENSG00000151322
|
| 26 |
+
ENSG00000151322
|
| 27 |
+
ENSG00000156113
|
| 28 |
+
ENSG00000164399
|
| 29 |
+
ENSG00000164400
|
| 30 |
+
ENSG00000167749
|
| 31 |
+
ENSG00000167754
|
| 32 |
+
ENSG00000167755
|
| 33 |
+
ENSG00000169035
|
| 34 |
+
ENSG00000170927
|
| 35 |
+
ENSG00000182177
|
| 36 |
+
ENSG00000186153
|
| 37 |
+
ENSG00000187098
|
| 38 |
+
ENSG00000204764
|
| 39 |
+
ENSG00000213022
|
| 40 |
+
ENSG00000213822
|
| 41 |
+
ENSG00000261701
|
| 42 |
+
ENSG00000285708
|
examples/example_input_files/dosage_sensitive_tfs/dosage_sens_tf_labels.csv
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dosage_sensitive,dosage_insensitive
|
| 2 |
+
ENSG00000008197,ENSG00000010539
|
| 3 |
+
ENSG00000008441,ENSG00000011590
|
| 4 |
+
ENSG00000010818,ENSG00000063438
|
| 5 |
+
ENSG00000011332,ENSG00000063587
|
| 6 |
+
ENSG00000030419,ENSG00000064218
|
| 7 |
+
ENSG00000062194,ENSG00000064489
|
| 8 |
+
ENSG00000065970,ENSG00000067646
|
| 9 |
+
ENSG00000067082,ENSG00000075407
|
| 10 |
+
ENSG00000069667,ENSG00000079263
|
| 11 |
+
ENSG00000072736,ENSG00000081386
|
| 12 |
+
ENSG00000073282,ENSG00000083812
|
| 13 |
+
ENSG00000073861,ENSG00000083814
|
| 14 |
+
ENSG00000077092,ENSG00000083828
|
| 15 |
+
ENSG00000083307,ENSG00000083838
|
| 16 |
+
ENSG00000084676,ENSG00000083844
|
| 17 |
+
ENSG00000085276,ENSG00000085644
|
| 18 |
+
ENSG00000087510,ENSG00000089335
|
| 19 |
+
ENSG00000087903,ENSG00000089775
|
| 20 |
+
ENSG00000089225,ENSG00000102901
|
| 21 |
+
ENSG00000091656,ENSG00000103199
|
| 22 |
+
ENSG00000091831,ENSG00000105136
|
| 23 |
+
ENSG00000095951,ENSG00000105610
|
| 24 |
+
ENSG00000100644,ENSG00000105672
|
| 25 |
+
ENSG00000100968,ENSG00000106410
|
| 26 |
+
ENSG00000101076,ENSG00000106948
|
| 27 |
+
ENSG00000101412,ENSG00000109705
|
| 28 |
+
ENSG00000102870,ENSG00000115568
|
| 29 |
+
ENSG00000102935,ENSG00000117010
|
| 30 |
+
ENSG00000103449,ENSG00000118620
|
| 31 |
+
ENSG00000105698,ENSG00000119574
|
| 32 |
+
ENSG00000105866,ENSG00000120669
|
| 33 |
+
ENSG00000106689,ENSG00000121406
|
| 34 |
+
ENSG00000106852,ENSG00000121864
|
| 35 |
+
ENSG00000111249,ENSG00000122085
|
| 36 |
+
ENSG00000111783,ENSG00000124203
|
| 37 |
+
ENSG00000112033,ENSG00000124232
|
| 38 |
+
ENSG00000112246,ENSG00000124444
|
| 39 |
+
ENSG00000112561,ENSG00000124613
|
| 40 |
+
ENSG00000112837,ENSG00000125520
|
| 41 |
+
ENSG00000115112,ENSG00000127081
|
| 42 |
+
ENSG00000116809,ENSG00000127903
|
| 43 |
+
ENSG00000116833,ENSG00000127989
|
| 44 |
+
ENSG00000117000,ENSG00000129028
|
| 45 |
+
ENSG00000118263,ENSG00000129071
|
| 46 |
+
ENSG00000118922,ENSG00000129194
|
| 47 |
+
ENSG00000119547,ENSG00000130544
|
| 48 |
+
ENSG00000120798,ENSG00000130818
|
| 49 |
+
ENSG00000121068,ENSG00000131848
|
| 50 |
+
ENSG00000123358,ENSG00000132010
|
| 51 |
+
ENSG00000123411,ENSG00000132846
|
| 52 |
+
ENSG00000124496,ENSG00000133250
|
| 53 |
+
ENSG00000124813,ENSG00000134874
|
| 54 |
+
ENSG00000125398,ENSG00000135899
|
| 55 |
+
ENSG00000125618,ENSG00000136866
|
| 56 |
+
ENSG00000126368,ENSG00000137185
|
| 57 |
+
ENSG00000127152,ENSG00000137504
|
| 58 |
+
ENSG00000128573,ENSG00000138380
|
| 59 |
+
ENSG00000129173,ENSG00000140993
|
| 60 |
+
ENSG00000131759,ENSG00000141946
|
| 61 |
+
ENSG00000132005,ENSG00000142556
|
| 62 |
+
ENSG00000133794,ENSG00000143067
|
| 63 |
+
ENSG00000134046,ENSG00000144026
|
| 64 |
+
ENSG00000134317,ENSG00000144161
|
| 65 |
+
ENSG00000134323,ENSG00000145908
|
| 66 |
+
ENSG00000134852,ENSG00000146587
|
| 67 |
+
ENSG00000135111,ENSG00000147183
|
| 68 |
+
ENSG00000137203,ENSG00000147789
|
| 69 |
+
ENSG00000137270,ENSG00000148300
|
| 70 |
+
ENSG00000138795,ENSG00000149054
|
| 71 |
+
ENSG00000139083,ENSG00000149922
|
| 72 |
+
ENSG00000139793,ENSG00000151500
|
| 73 |
+
ENSG00000140548,ENSG00000151650
|
| 74 |
+
ENSG00000140968,ENSG00000151657
|
| 75 |
+
ENSG00000142611,ENSG00000152439
|
| 76 |
+
ENSG00000143033,ENSG00000152467
|
| 77 |
+
ENSG00000143171,ENSG00000152475
|
| 78 |
+
ENSG00000143190,ENSG00000153975
|
| 79 |
+
ENSG00000143355,ENSG00000155592
|
| 80 |
+
ENSG00000143365,ENSG00000156469
|
| 81 |
+
ENSG00000143373,ENSG00000157429
|
| 82 |
+
ENSG00000143437,ENSG00000159882
|
| 83 |
+
ENSG00000144355,ENSG00000159885
|
| 84 |
+
ENSG00000147862,ENSG00000159915
|
| 85 |
+
ENSG00000148516,ENSG00000160224
|
| 86 |
+
ENSG00000150907,ENSG00000160229
|
| 87 |
+
ENSG00000151090,ENSG00000160352
|
| 88 |
+
ENSG00000153234,ENSG00000160908
|
| 89 |
+
ENSG00000158055,ENSG00000160961
|
| 90 |
+
ENSG00000160007,ENSG00000161277
|
| 91 |
+
ENSG00000160094,ENSG00000162086
|
| 92 |
+
ENSG00000161405,ENSG00000163516
|
| 93 |
+
ENSG00000162761,ENSG00000164011
|
| 94 |
+
ENSG00000162924,ENSG00000164048
|
| 95 |
+
ENSG00000164683,ENSG00000164296
|
| 96 |
+
ENSG00000164684,ENSG00000164299
|
| 97 |
+
ENSG00000167182,ENSG00000165066
|
| 98 |
+
ENSG00000168610,ENSG00000165512
|
| 99 |
+
ENSG00000168916,ENSG00000165643
|
| 100 |
+
ENSG00000169554,ENSG00000165684
|
| 101 |
+
ENSG00000169946,ENSG00000166529
|
| 102 |
+
ENSG00000170370,ENSG00000166823
|
| 103 |
+
ENSG00000172733,ENSG00000166860
|
| 104 |
+
ENSG00000172819,ENSG00000167034
|
| 105 |
+
ENSG00000177463,ENSG00000167384
|
| 106 |
+
ENSG00000178177,ENSG00000167554
|
| 107 |
+
ENSG00000179348,ENSG00000167625
|
| 108 |
+
ENSG00000179361,ENSG00000167785
|
| 109 |
+
ENSG00000179456,ENSG00000167800
|
| 110 |
+
ENSG00000180357,ENSG00000167840
|
| 111 |
+
ENSG00000185551,ENSG00000167962
|
| 112 |
+
ENSG00000185591,ENSG00000167981
|
| 113 |
+
ENSG00000187098,ENSG00000168152
|
| 114 |
+
ENSG00000187605,ENSG00000168286
|
| 115 |
+
ENSG00000189308,ENSG00000168769
|
| 116 |
+
ENSG00000196092,ENSG00000169131
|
| 117 |
+
ENSG00000196482,ENSG00000169136
|
| 118 |
+
ENSG00000196628,ENSG00000169548
|
| 119 |
+
ENSG00000197757,ENSG00000169951
|
| 120 |
+
ENSG00000198815,ENSG00000169955
|
| 121 |
+
ENSG00000198945,ENSG00000169989
|
| 122 |
+
ENSG00000198963,ENSG00000170260
|
| 123 |
+
ENSG00000204231,ENSG00000170608
|
| 124 |
+
,ENSG00000170954
|
| 125 |
+
,ENSG00000171291
|
| 126 |
+
,ENSG00000171295
|
| 127 |
+
,ENSG00000171425
|
| 128 |
+
,ENSG00000171443
|
| 129 |
+
,ENSG00000171466
|
| 130 |
+
,ENSG00000171469
|
| 131 |
+
,ENSG00000171574
|
| 132 |
+
,ENSG00000171606
|
| 133 |
+
,ENSG00000171827
|
| 134 |
+
,ENSG00000171872
|
| 135 |
+
,ENSG00000171970
|
| 136 |
+
,ENSG00000172000
|
| 137 |
+
,ENSG00000172888
|
| 138 |
+
,ENSG00000173041
|
| 139 |
+
,ENSG00000173258
|
| 140 |
+
,ENSG00000173480
|
| 141 |
+
,ENSG00000173673
|
| 142 |
+
,ENSG00000173825
|
| 143 |
+
,ENSG00000174255
|
| 144 |
+
,ENSG00000174652
|
| 145 |
+
,ENSG00000174796
|
| 146 |
+
,ENSG00000175279
|
| 147 |
+
,ENSG00000175325
|
| 148 |
+
,ENSG00000175395
|
| 149 |
+
,ENSG00000175691
|
| 150 |
+
,ENSG00000176009
|
| 151 |
+
,ENSG00000176024
|
| 152 |
+
,ENSG00000176083
|
| 153 |
+
,ENSG00000176222
|
| 154 |
+
,ENSG00000176302
|
| 155 |
+
,ENSG00000176472
|
| 156 |
+
,ENSG00000176678
|
| 157 |
+
,ENSG00000176679
|
| 158 |
+
,ENSG00000177030
|
| 159 |
+
,ENSG00000177494
|
| 160 |
+
,ENSG00000177599
|
| 161 |
+
,ENSG00000177683
|
| 162 |
+
,ENSG00000177842
|
| 163 |
+
,ENSG00000177873
|
| 164 |
+
,ENSG00000177932
|
| 165 |
+
,ENSG00000177946
|
| 166 |
+
,ENSG00000178150
|
| 167 |
+
,ENSG00000178229
|
| 168 |
+
,ENSG00000178338
|
| 169 |
+
,ENSG00000178386
|
| 170 |
+
,ENSG00000178665
|
| 171 |
+
,ENSG00000178917
|
| 172 |
+
,ENSG00000178928
|
| 173 |
+
,ENSG00000178935
|
| 174 |
+
,ENSG00000179195
|
| 175 |
+
,ENSG00000179772
|
| 176 |
+
,ENSG00000179774
|
| 177 |
+
,ENSG00000179886
|
| 178 |
+
,ENSG00000179909
|
| 179 |
+
,ENSG00000179922
|
| 180 |
+
,ENSG00000179930
|
| 181 |
+
,ENSG00000179943
|
| 182 |
+
,ENSG00000179965
|
| 183 |
+
,ENSG00000180257
|
| 184 |
+
,ENSG00000180346
|
| 185 |
+
,ENSG00000180532
|
| 186 |
+
,ENSG00000180535
|
| 187 |
+
,ENSG00000180938
|
| 188 |
+
,ENSG00000181135
|
| 189 |
+
,ENSG00000181444
|
| 190 |
+
,ENSG00000181450
|
| 191 |
+
,ENSG00000181638
|
| 192 |
+
,ENSG00000181894
|
| 193 |
+
,ENSG00000181896
|
| 194 |
+
,ENSG00000182318
|
| 195 |
+
,ENSG00000182983
|
| 196 |
+
,ENSG00000182986
|
| 197 |
+
,ENSG00000183340
|
| 198 |
+
,ENSG00000183647
|
| 199 |
+
,ENSG00000183734
|
| 200 |
+
,ENSG00000183850
|
| 201 |
+
,ENSG00000184221
|
| 202 |
+
,ENSG00000184517
|
| 203 |
+
,ENSG00000184635
|
| 204 |
+
,ENSG00000184677
|
| 205 |
+
,ENSG00000184895
|
| 206 |
+
,ENSG00000185155
|
| 207 |
+
,ENSG00000185252
|
| 208 |
+
,ENSG00000185404
|
| 209 |
+
,ENSG00000185730
|
| 210 |
+
,ENSG00000186020
|
| 211 |
+
,ENSG00000186026
|
| 212 |
+
,ENSG00000186051
|
| 213 |
+
,ENSG00000186103
|
| 214 |
+
,ENSG00000186230
|
| 215 |
+
,ENSG00000186300
|
| 216 |
+
,ENSG00000186376
|
| 217 |
+
,ENSG00000186446
|
| 218 |
+
,ENSG00000186496
|
| 219 |
+
,ENSG00000186777
|
| 220 |
+
,ENSG00000186812
|
| 221 |
+
,ENSG00000186814
|
| 222 |
+
,ENSG00000187626
|
| 223 |
+
,ENSG00000187801
|
| 224 |
+
,ENSG00000187821
|
| 225 |
+
,ENSG00000187855
|
| 226 |
+
,ENSG00000187987
|
| 227 |
+
,ENSG00000188033
|
| 228 |
+
,ENSG00000188095
|
| 229 |
+
,ENSG00000188171
|
| 230 |
+
,ENSG00000188295
|
| 231 |
+
,ENSG00000188321
|
| 232 |
+
,ENSG00000188629
|
| 233 |
+
,ENSG00000188785
|
| 234 |
+
,ENSG00000188868
|
| 235 |
+
,ENSG00000189164
|
| 236 |
+
,ENSG00000189190
|
| 237 |
+
,ENSG00000189298
|
| 238 |
+
,ENSG00000189299
|
| 239 |
+
,ENSG00000196152
|
| 240 |
+
,ENSG00000196172
|
| 241 |
+
,ENSG00000196214
|
| 242 |
+
,ENSG00000196345
|
| 243 |
+
,ENSG00000196357
|
| 244 |
+
,ENSG00000196378
|
| 245 |
+
,ENSG00000196381
|
| 246 |
+
,ENSG00000196387
|
| 247 |
+
,ENSG00000196391
|
| 248 |
+
,ENSG00000196417
|
| 249 |
+
,ENSG00000196418
|
| 250 |
+
,ENSG00000196456
|
| 251 |
+
,ENSG00000196460
|
| 252 |
+
,ENSG00000196466
|
| 253 |
+
,ENSG00000196605
|
| 254 |
+
,ENSG00000196646
|
| 255 |
+
,ENSG00000196652
|
| 256 |
+
,ENSG00000196670
|
| 257 |
+
,ENSG00000196693
|
| 258 |
+
,ENSG00000196705
|
| 259 |
+
,ENSG00000196812
|
| 260 |
+
,ENSG00000196946
|
| 261 |
+
,ENSG00000197008
|
| 262 |
+
,ENSG00000197020
|
| 263 |
+
,ENSG00000197037
|
| 264 |
+
,ENSG00000197044
|
| 265 |
+
,ENSG00000197054
|
| 266 |
+
,ENSG00000197124
|
| 267 |
+
,ENSG00000197134
|
| 268 |
+
,ENSG00000197162
|
| 269 |
+
,ENSG00000197213
|
| 270 |
+
,ENSG00000197279
|
| 271 |
+
,ENSG00000197343
|
| 272 |
+
,ENSG00000197360
|
| 273 |
+
,ENSG00000197363
|
| 274 |
+
,ENSG00000197472
|
| 275 |
+
,ENSG00000197779
|
| 276 |
+
,ENSG00000197841
|
| 277 |
+
,ENSG00000197857
|
| 278 |
+
,ENSG00000197863
|
| 279 |
+
,ENSG00000197928
|
| 280 |
+
,ENSG00000197933
|
| 281 |
+
,ENSG00000197951
|
| 282 |
+
,ENSG00000198028
|
| 283 |
+
,ENSG00000198039
|
| 284 |
+
,ENSG00000198046
|
| 285 |
+
,ENSG00000198185
|
| 286 |
+
,ENSG00000198205
|
| 287 |
+
,ENSG00000198300
|
| 288 |
+
,ENSG00000198315
|
| 289 |
+
,ENSG00000198342
|
| 290 |
+
,ENSG00000198346
|
| 291 |
+
,ENSG00000198429
|
| 292 |
+
,ENSG00000198440
|
| 293 |
+
,ENSG00000198464
|
| 294 |
+
,ENSG00000198466
|
| 295 |
+
,ENSG00000198482
|
| 296 |
+
,ENSG00000198538
|
| 297 |
+
,ENSG00000198546
|
| 298 |
+
,ENSG00000198551
|
| 299 |
+
,ENSG00000198556
|
| 300 |
+
,ENSG00000198633
|
| 301 |
+
,ENSG00000198939
|
| 302 |
+
,ENSG00000203326
|
| 303 |
+
,ENSG00000204514
|
| 304 |
+
,ENSG00000204519
|
| 305 |
+
,ENSG00000204532
|
| 306 |
+
,ENSG00000204595
|
| 307 |
+
,ENSG00000204604
|
| 308 |
+
,ENSG00000204644
|
| 309 |
+
,ENSG00000204946
|
| 310 |
+
,ENSG00000213020
|
| 311 |
+
,ENSG00000213799
|
| 312 |
+
,ENSG00000213973
|
| 313 |
+
,ENSG00000213988
|
| 314 |
+
,ENSG00000214189
|
| 315 |
+
,ENSG00000215271
|
| 316 |
+
,ENSG00000215372
|
| 317 |
+
,ENSG00000215612
|
| 318 |
+
,ENSG00000220201
|
| 319 |
+
,ENSG00000221923
|
| 320 |
+
,ENSG00000223547
|
| 321 |
+
,ENSG00000227124
|
| 322 |
+
,ENSG00000229676
|
| 323 |
+
,ENSG00000229809
|
| 324 |
+
,ENSG00000230797
|
| 325 |
+
,ENSG00000232040
|
| 326 |
+
,ENSG00000234284
|
| 327 |
+
,ENSG00000234444
|
| 328 |
+
,ENSG00000235109
|
| 329 |
+
,ENSG00000235608
|
| 330 |
+
,ENSG00000236104
|
| 331 |
+
,ENSG00000236609
|
| 332 |
+
,ENSG00000237440
|
| 333 |
+
,ENSG00000242852
|
| 334 |
+
,ENSG00000243660
|
| 335 |
+
,ENSG00000245680
|
| 336 |
+
,ENSG00000248483
|
| 337 |
+
,ENSG00000249459
|
| 338 |
+
,ENSG00000249471
|
| 339 |
+
,ENSG00000249709
|
| 340 |
+
,ENSG00000250571
|
| 341 |
+
,ENSG00000250709
|
| 342 |
+
,ENSG00000251192
|
| 343 |
+
,ENSG00000251247
|
| 344 |
+
,ENSG00000251369
|
| 345 |
+
,ENSG00000253831
|
| 346 |
+
,ENSG00000254004
|
| 347 |
+
,ENSG00000256087
|
| 348 |
+
,ENSG00000256223
|
| 349 |
+
,ENSG00000256229
|
| 350 |
+
,ENSG00000256294
|
| 351 |
+
,ENSG00000256463
|
| 352 |
+
,ENSG00000256683
|
| 353 |
+
,ENSG00000256771
|
| 354 |
+
,ENSG00000257446
|
| 355 |
+
,ENSG00000257591
|
| 356 |
+
,ENSG00000258405
|
| 357 |
+
,ENSG00000258873
|
| 358 |
+
,ENSG00000263002
|
| 359 |
+
,ENSG00000264668
|
| 360 |
+
,ENSG00000265763
|
| 361 |
+
,ENSG00000267041
|
| 362 |
+
,ENSG00000267179
|
| 363 |
+
,ENSG00000267281
|
| 364 |
+
,ENSG00000267508
|
| 365 |
+
,ENSG00000267680
|
| 366 |
+
,ENSG00000269067
|
| 367 |
+
,ENSG00000269343
|
| 368 |
+
,ENSG00000269699
|
| 369 |
+
,ENSG00000272602
|
examples/example_input_files/gene_info_table.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
examples/extract_and_plot_cell_embeddings.ipynb
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
examples/gene_classification.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|